cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

habanalabs_drv.c (15612B)


      1// SPDX-License-Identifier: GPL-2.0
      2
      3/*
      4 * Copyright 2016-2021 HabanaLabs, Ltd.
      5 * All Rights Reserved.
      6 *
      7 */
      8
      9#define pr_fmt(fmt)		"habanalabs: " fmt
     10
     11#include "habanalabs.h"
     12
     13#include <linux/pci.h>
     14#include <linux/aer.h>
     15#include <linux/module.h>
     16
     17#define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
     18
     19#define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
     20
     21MODULE_AUTHOR(HL_DRIVER_AUTHOR);
     22MODULE_DESCRIPTION(HL_DRIVER_DESC);
     23MODULE_LICENSE("GPL v2");
     24
     25static int hl_major;
     26static struct class *hl_class;
     27static DEFINE_IDR(hl_devs_idr);
     28static DEFINE_MUTEX(hl_devs_idr_lock);
     29
     30static int timeout_locked = 30;
     31static int reset_on_lockup = 1;
     32static int memory_scrub;
     33static ulong boot_error_status_mask = ULONG_MAX;
     34
     35module_param(timeout_locked, int, 0444);
     36MODULE_PARM_DESC(timeout_locked,
     37	"Device lockup timeout in seconds (0 = disabled, default 30s)");
     38
     39module_param(reset_on_lockup, int, 0444);
     40MODULE_PARM_DESC(reset_on_lockup,
     41	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
     42
     43module_param(memory_scrub, int, 0444);
     44MODULE_PARM_DESC(memory_scrub,
     45	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
     46
     47module_param(boot_error_status_mask, ulong, 0444);
     48MODULE_PARM_DESC(boot_error_status_mask,
     49	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
     50
     51#define PCI_VENDOR_ID_HABANALABS	0x1da3
     52
     53#define PCI_IDS_GOYA			0x0001
     54#define PCI_IDS_GAUDI			0x1000
     55#define PCI_IDS_GAUDI_SEC		0x1010
     56
     57static const struct pci_device_id ids[] = {
     58	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
     59	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
     60	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
     61	{ 0, }
     62};
     63MODULE_DEVICE_TABLE(pci, ids);
     64
     65/*
     66 * get_asic_type - translate device id to asic type
     67 *
     68 * @device: id of the PCI device
     69 *
     70 * Translate device id to asic type.
     71 * In case of unidentified device, return -1
     72 */
     73static enum hl_asic_type get_asic_type(u16 device)
     74{
     75	enum hl_asic_type asic_type;
     76
     77	switch (device) {
     78	case PCI_IDS_GOYA:
     79		asic_type = ASIC_GOYA;
     80		break;
     81	case PCI_IDS_GAUDI:
     82		asic_type = ASIC_GAUDI;
     83		break;
     84	case PCI_IDS_GAUDI_SEC:
     85		asic_type = ASIC_GAUDI_SEC;
     86		break;
     87	default:
     88		asic_type = ASIC_INVALID;
     89		break;
     90	}
     91
     92	return asic_type;
     93}
     94
     95static bool is_asic_secured(enum hl_asic_type asic_type)
     96{
     97	switch (asic_type) {
     98	case ASIC_GAUDI_SEC:
     99		return true;
    100	default:
    101		return false;
    102	}
    103}
    104
    105/*
    106 * hl_device_open - open function for habanalabs device
    107 *
    108 * @inode: pointer to inode structure
    109 * @filp: pointer to file structure
    110 *
    111 * Called when process opens an habanalabs device.
    112 */
    113int hl_device_open(struct inode *inode, struct file *filp)
    114{
    115	enum hl_device_status status;
    116	struct hl_device *hdev;
    117	struct hl_fpriv *hpriv;
    118	int rc;
    119
    120	mutex_lock(&hl_devs_idr_lock);
    121	hdev = idr_find(&hl_devs_idr, iminor(inode));
    122	mutex_unlock(&hl_devs_idr_lock);
    123
    124	if (!hdev) {
    125		pr_err("Couldn't find device %d:%d\n",
    126			imajor(inode), iminor(inode));
    127		return -ENXIO;
    128	}
    129
    130	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
    131	if (!hpriv)
    132		return -ENOMEM;
    133
    134	hpriv->hdev = hdev;
    135	filp->private_data = hpriv;
    136	hpriv->filp = filp;
    137
    138	mutex_init(&hpriv->notifier_event.lock);
    139	mutex_init(&hpriv->restore_phase_mutex);
    140	kref_init(&hpriv->refcount);
    141	nonseekable_open(inode, filp);
    142
    143	hl_ctx_mgr_init(&hpriv->ctx_mgr);
    144	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
    145
    146	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
    147
    148	mutex_lock(&hdev->fpriv_list_lock);
    149
    150	if (!hl_device_operational(hdev, &status)) {
    151		dev_err_ratelimited(hdev->dev,
    152			"Can't open %s because it is %s\n",
    153			dev_name(hdev->dev), hdev->status[status]);
    154
    155		if (status == HL_DEVICE_STATUS_IN_RESET)
    156			rc = -EAGAIN;
    157		else
    158			rc = -EPERM;
    159
    160		goto out_err;
    161	}
    162
    163	if (hdev->is_in_dram_scrub) {
    164		dev_dbg_ratelimited(hdev->dev,
    165			"Can't open %s during dram scrub\n",
    166			dev_name(hdev->dev));
    167		rc = -EAGAIN;
    168		goto out_err;
    169	}
    170
    171	if (hdev->compute_ctx_in_release) {
    172		dev_dbg_ratelimited(hdev->dev,
    173			"Can't open %s because another user is still releasing it\n",
    174			dev_name(hdev->dev));
    175		rc = -EAGAIN;
    176		goto out_err;
    177	}
    178
    179	if (hdev->is_compute_ctx_active) {
    180		dev_dbg_ratelimited(hdev->dev,
    181			"Can't open %s because another user is working on it\n",
    182			dev_name(hdev->dev));
    183		rc = -EBUSY;
    184		goto out_err;
    185	}
    186
    187	rc = hl_ctx_create(hdev, hpriv);
    188	if (rc) {
    189		dev_err(hdev->dev, "Failed to create context %d\n", rc);
    190		goto out_err;
    191	}
    192
    193	list_add(&hpriv->dev_node, &hdev->fpriv_list);
    194	mutex_unlock(&hdev->fpriv_list_lock);
    195
    196	hl_debugfs_add_file(hpriv);
    197
    198	atomic_set(&hdev->last_error.cs_timeout.write_disable, 0);
    199	atomic_set(&hdev->last_error.razwi.write_disable, 0);
    200
    201	hdev->open_counter++;
    202	hdev->last_successful_open_jif = jiffies;
    203	hdev->last_successful_open_ktime = ktime_get();
    204
    205	return 0;
    206
    207out_err:
    208	mutex_unlock(&hdev->fpriv_list_lock);
    209	hl_mem_mgr_fini(&hpriv->mem_mgr);
    210	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
    211	filp->private_data = NULL;
    212	mutex_destroy(&hpriv->restore_phase_mutex);
    213	mutex_destroy(&hpriv->notifier_event.lock);
    214	put_pid(hpriv->taskpid);
    215
    216	kfree(hpriv);
    217
    218	return rc;
    219}
    220
    221int hl_device_open_ctrl(struct inode *inode, struct file *filp)
    222{
    223	struct hl_device *hdev;
    224	struct hl_fpriv *hpriv;
    225	int rc;
    226
    227	mutex_lock(&hl_devs_idr_lock);
    228	hdev = idr_find(&hl_devs_idr, iminor(inode));
    229	mutex_unlock(&hl_devs_idr_lock);
    230
    231	if (!hdev) {
    232		pr_err("Couldn't find device %d:%d\n",
    233			imajor(inode), iminor(inode));
    234		return -ENXIO;
    235	}
    236
    237	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
    238	if (!hpriv)
    239		return -ENOMEM;
    240
    241	/* Prevent other routines from reading partial hpriv data by
    242	 * initializing hpriv fields before inserting it to the list
    243	 */
    244	hpriv->hdev = hdev;
    245	filp->private_data = hpriv;
    246	hpriv->filp = filp;
    247
    248	mutex_init(&hpriv->notifier_event.lock);
    249	nonseekable_open(inode, filp);
    250
    251	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
    252
    253	mutex_lock(&hdev->fpriv_ctrl_list_lock);
    254
    255	if (!hl_device_operational(hdev, NULL)) {
    256		dev_err_ratelimited(hdev->dev_ctrl,
    257			"Can't open %s because it is disabled or in reset\n",
    258			dev_name(hdev->dev_ctrl));
    259		rc = -EPERM;
    260		goto out_err;
    261	}
    262
    263	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
    264	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
    265
    266	return 0;
    267
    268out_err:
    269	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
    270	filp->private_data = NULL;
    271	put_pid(hpriv->taskpid);
    272
    273	kfree(hpriv);
    274
    275	return rc;
    276}
    277
    278static void set_driver_behavior_per_device(struct hl_device *hdev)
    279{
    280	hdev->pldm = 0;
    281	hdev->fw_components = FW_TYPE_ALL_TYPES;
    282	hdev->cpu_queues_enable = 1;
    283	hdev->heartbeat = 1;
    284	hdev->mmu_enable = 1;
    285	hdev->sram_scrambler_enable = 1;
    286	hdev->dram_scrambler_enable = 1;
    287	hdev->bmc_enable = 1;
    288	hdev->hard_reset_on_fw_events = 1;
    289	hdev->reset_on_preboot_fail = 1;
    290	hdev->reset_if_device_not_idle = 1;
    291
    292	hdev->reset_pcilink = 0;
    293	hdev->axi_drain = 0;
    294}
    295
    296static void copy_kernel_module_params_to_device(struct hl_device *hdev)
    297{
    298	hdev->major = hl_major;
    299	hdev->memory_scrub = memory_scrub;
    300	hdev->reset_on_lockup = reset_on_lockup;
    301	hdev->boot_error_status_mask = boot_error_status_mask;
    302
    303	if (timeout_locked)
    304		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
    305	else
    306		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
    307
    308}
    309
    310static int fixup_device_params(struct hl_device *hdev)
    311{
    312	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
    313
    314	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
    315	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
    316
    317	hdev->stop_on_err = true;
    318	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
    319	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
    320
    321	/* Enable only after the initialization of the device */
    322	hdev->disabled = true;
    323
    324	return 0;
    325}
    326
    327/**
    328 * create_hdev - create habanalabs device instance
    329 *
    330 * @dev: will hold the pointer to the new habanalabs device structure
    331 * @pdev: pointer to the pci device
    332 *
    333 * Allocate memory for habanalabs device and initialize basic fields
    334 * Identify the ASIC type
    335 * Allocate ID (minor) for the device (only for real devices)
    336 */
    337static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
    338{
    339	int main_id, ctrl_id = 0, rc = 0;
    340	struct hl_device *hdev;
    341
    342	*dev = NULL;
    343
    344	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
    345	if (!hdev)
    346		return -ENOMEM;
    347
    348	/* can be NULL in case of simulator device */
    349	hdev->pdev = pdev;
    350
    351	/* Assign status description string */
    352	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
    353	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
    354	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
    355	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
    356	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
    357					"in device creation", HL_STR_MAX);
    358
    359	/* First, we must find out which ASIC are we handling. This is needed
    360	 * to configure the behavior of the driver (kernel parameters)
    361	 */
    362	hdev->asic_type = get_asic_type(pdev->device);
    363	if (hdev->asic_type == ASIC_INVALID) {
    364		dev_err(&pdev->dev, "Unsupported ASIC\n");
    365		rc = -ENODEV;
    366		goto free_hdev;
    367	}
    368
    369	copy_kernel_module_params_to_device(hdev);
    370
    371	set_driver_behavior_per_device(hdev);
    372
    373	fixup_device_params(hdev);
    374
    375	mutex_lock(&hl_devs_idr_lock);
    376
    377	/* Always save 2 numbers, 1 for main device and 1 for control.
    378	 * They must be consecutive
    379	 */
    380	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
    381
    382	if (main_id >= 0)
    383		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
    384					main_id + 2, GFP_KERNEL);
    385
    386	mutex_unlock(&hl_devs_idr_lock);
    387
    388	if ((main_id < 0) || (ctrl_id < 0)) {
    389		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
    390			pr_err("too many devices in the system\n");
    391
    392		if (main_id >= 0) {
    393			mutex_lock(&hl_devs_idr_lock);
    394			idr_remove(&hl_devs_idr, main_id);
    395			mutex_unlock(&hl_devs_idr_lock);
    396		}
    397
    398		rc = -EBUSY;
    399		goto free_hdev;
    400	}
    401
    402	hdev->id = main_id;
    403	hdev->id_control = ctrl_id;
    404
    405	*dev = hdev;
    406
    407	return 0;
    408
    409free_hdev:
    410	kfree(hdev);
    411	return rc;
    412}
    413
    414/*
    415 * destroy_hdev - destroy habanalabs device instance
    416 *
    417 * @dev: pointer to the habanalabs device structure
    418 *
    419 */
    420static void destroy_hdev(struct hl_device *hdev)
    421{
    422	/* Remove device from the device list */
    423	mutex_lock(&hl_devs_idr_lock);
    424	idr_remove(&hl_devs_idr, hdev->id);
    425	idr_remove(&hl_devs_idr, hdev->id_control);
    426	mutex_unlock(&hl_devs_idr_lock);
    427
    428	kfree(hdev);
    429}
    430
    431static int hl_pmops_suspend(struct device *dev)
    432{
    433	struct hl_device *hdev = dev_get_drvdata(dev);
    434
    435	pr_debug("Going to suspend PCI device\n");
    436
    437	if (!hdev) {
    438		pr_err("device pointer is NULL in suspend\n");
    439		return 0;
    440	}
    441
    442	return hl_device_suspend(hdev);
    443}
    444
    445static int hl_pmops_resume(struct device *dev)
    446{
    447	struct hl_device *hdev = dev_get_drvdata(dev);
    448
    449	pr_debug("Going to resume PCI device\n");
    450
    451	if (!hdev) {
    452		pr_err("device pointer is NULL in resume\n");
    453		return 0;
    454	}
    455
    456	return hl_device_resume(hdev);
    457}
    458
    459/**
    460 * hl_pci_probe - probe PCI habanalabs devices
    461 *
    462 * @pdev: pointer to pci device
    463 * @id: pointer to pci device id structure
    464 *
    465 * Standard PCI probe function for habanalabs device.
    466 * Create a new habanalabs device and initialize it according to the
    467 * device's type
    468 */
    469static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
    470{
    471	struct hl_device *hdev;
    472	int rc;
    473
    474	dev_info(&pdev->dev, HL_NAME
    475		 " device found [%04x:%04x] (rev %x)\n",
    476		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
    477
    478	rc = create_hdev(&hdev, pdev);
    479	if (rc)
    480		return rc;
    481
    482	pci_set_drvdata(pdev, hdev);
    483
    484	pci_enable_pcie_error_reporting(pdev);
    485
    486	rc = hl_device_init(hdev, hl_class);
    487	if (rc) {
    488		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
    489		rc = -ENODEV;
    490		goto disable_device;
    491	}
    492
    493	return 0;
    494
    495disable_device:
    496	pci_disable_pcie_error_reporting(pdev);
    497	pci_set_drvdata(pdev, NULL);
    498	destroy_hdev(hdev);
    499
    500	return rc;
    501}
    502
    503/*
    504 * hl_pci_remove - remove PCI habanalabs devices
    505 *
    506 * @pdev: pointer to pci device
    507 *
    508 * Standard PCI remove function for habanalabs device
    509 */
    510static void hl_pci_remove(struct pci_dev *pdev)
    511{
    512	struct hl_device *hdev;
    513
    514	hdev = pci_get_drvdata(pdev);
    515	if (!hdev)
    516		return;
    517
    518	hl_device_fini(hdev);
    519	pci_disable_pcie_error_reporting(pdev);
    520	pci_set_drvdata(pdev, NULL);
    521	destroy_hdev(hdev);
    522}
    523
    524/**
    525 * hl_pci_err_detected - a PCI bus error detected on this device
    526 *
    527 * @pdev: pointer to pci device
    528 * @state: PCI error type
    529 *
    530 * Called by the PCI subsystem whenever a non-correctable
    531 * PCI bus error is detected
    532 */
    533static pci_ers_result_t
    534hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
    535{
    536	struct hl_device *hdev = pci_get_drvdata(pdev);
    537	enum pci_ers_result result;
    538
    539	switch (state) {
    540	case pci_channel_io_normal:
    541		return PCI_ERS_RESULT_CAN_RECOVER;
    542
    543	case pci_channel_io_frozen:
    544		dev_warn(hdev->dev, "frozen state error detected\n");
    545		result = PCI_ERS_RESULT_NEED_RESET;
    546		break;
    547
    548	case pci_channel_io_perm_failure:
    549		dev_warn(hdev->dev, "failure state error detected\n");
    550		result = PCI_ERS_RESULT_DISCONNECT;
    551		break;
    552
    553	default:
    554		result = PCI_ERS_RESULT_NONE;
    555	}
    556
    557	hdev->asic_funcs->halt_engines(hdev, true, false);
    558
    559	return result;
    560}
    561
    562/**
    563 * hl_pci_err_resume - resume after a PCI slot reset
    564 *
    565 * @pdev: pointer to pci device
    566 *
    567 */
    568static void hl_pci_err_resume(struct pci_dev *pdev)
    569{
    570	struct hl_device *hdev = pci_get_drvdata(pdev);
    571
    572	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
    573	hl_device_resume(hdev);
    574}
    575
    576/**
    577 * hl_pci_err_slot_reset - a PCI slot reset has just happened
    578 *
    579 * @pdev: pointer to pci device
    580 *
    581 * Determine if the driver can recover from the PCI slot reset
    582 */
    583static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
    584{
    585	return PCI_ERS_RESULT_RECOVERED;
    586}
    587
    588static const struct dev_pm_ops hl_pm_ops = {
    589	.suspend = hl_pmops_suspend,
    590	.resume = hl_pmops_resume,
    591};
    592
    593static const struct pci_error_handlers hl_pci_err_handler = {
    594	.error_detected = hl_pci_err_detected,
    595	.slot_reset = hl_pci_err_slot_reset,
    596	.resume = hl_pci_err_resume,
    597};
    598
    599static struct pci_driver hl_pci_driver = {
    600	.name = HL_NAME,
    601	.id_table = ids,
    602	.probe = hl_pci_probe,
    603	.remove = hl_pci_remove,
    604	.shutdown = hl_pci_remove,
    605	.driver = {
    606		.name = HL_NAME,
    607		.pm = &hl_pm_ops,
    608		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
    609	},
    610	.err_handler = &hl_pci_err_handler,
    611};
    612
    613/*
    614 * hl_init - Initialize the habanalabs kernel driver
    615 */
    616static int __init hl_init(void)
    617{
    618	int rc;
    619	dev_t dev;
    620
    621	pr_info("loading driver\n");
    622
    623	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
    624	if (rc < 0) {
    625		pr_err("unable to get major\n");
    626		return rc;
    627	}
    628
    629	hl_major = MAJOR(dev);
    630
    631	hl_class = class_create(THIS_MODULE, HL_NAME);
    632	if (IS_ERR(hl_class)) {
    633		pr_err("failed to allocate class\n");
    634		rc = PTR_ERR(hl_class);
    635		goto remove_major;
    636	}
    637
    638	hl_debugfs_init();
    639
    640	rc = pci_register_driver(&hl_pci_driver);
    641	if (rc) {
    642		pr_err("failed to register pci device\n");
    643		goto remove_debugfs;
    644	}
    645
    646	pr_debug("driver loaded\n");
    647
    648	return 0;
    649
    650remove_debugfs:
    651	hl_debugfs_fini();
    652	class_destroy(hl_class);
    653remove_major:
    654	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
    655	return rc;
    656}
    657
    658/*
    659 * hl_exit - Release all resources of the habanalabs kernel driver
    660 */
    661static void __exit hl_exit(void)
    662{
    663	pci_unregister_driver(&hl_pci_driver);
    664
    665	/*
    666	 * Removing debugfs must be after all devices or simulator devices
    667	 * have been removed because otherwise we get a bug in the
    668	 * debugfs module for referencing NULL objects
    669	 */
    670	hl_debugfs_fini();
    671
    672	class_destroy(hl_class);
    673	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
    674
    675	idr_destroy(&hl_devs_idr);
    676
    677	pr_debug("driver removed\n");
    678}
    679
    680module_init(hl_init);
    681module_exit(hl_exit);