adf_aer.c (6227B)
1// SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) 2/* Copyright(c) 2014 - 2020 Intel Corporation */ 3#include <linux/kernel.h> 4#include <linux/pci.h> 5#include <linux/aer.h> 6#include <linux/completion.h> 7#include <linux/workqueue.h> 8#include <linux/delay.h> 9#include "adf_accel_devices.h" 10#include "adf_common_drv.h" 11 12static struct workqueue_struct *device_reset_wq; 13 14static pci_ers_result_t adf_error_detected(struct pci_dev *pdev, 15 pci_channel_state_t state) 16{ 17 struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev); 18 19 dev_info(&pdev->dev, "Acceleration driver hardware error detected.\n"); 20 if (!accel_dev) { 21 dev_err(&pdev->dev, "Can't find acceleration device\n"); 22 return PCI_ERS_RESULT_DISCONNECT; 23 } 24 25 if (state == pci_channel_io_perm_failure) { 26 dev_err(&pdev->dev, "Can't recover from device error\n"); 27 return PCI_ERS_RESULT_DISCONNECT; 28 } 29 30 return PCI_ERS_RESULT_NEED_RESET; 31} 32 33/* reset dev data */ 34struct adf_reset_dev_data { 35 int mode; 36 struct adf_accel_dev *accel_dev; 37 struct completion compl; 38 struct work_struct reset_work; 39}; 40 41void adf_reset_sbr(struct adf_accel_dev *accel_dev) 42{ 43 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 44 struct pci_dev *parent = pdev->bus->self; 45 u16 bridge_ctl = 0; 46 47 if (!parent) 48 parent = pdev; 49 50 if (!pci_wait_for_pending_transaction(pdev)) 51 dev_info(&GET_DEV(accel_dev), 52 "Transaction still in progress. Proceeding\n"); 53 54 dev_info(&GET_DEV(accel_dev), "Secondary bus reset\n"); 55 56 pci_read_config_word(parent, PCI_BRIDGE_CONTROL, &bridge_ctl); 57 bridge_ctl |= PCI_BRIDGE_CTL_BUS_RESET; 58 pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl); 59 msleep(100); 60 bridge_ctl &= ~PCI_BRIDGE_CTL_BUS_RESET; 61 pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl); 62 msleep(100); 63} 64EXPORT_SYMBOL_GPL(adf_reset_sbr); 65 66void adf_reset_flr(struct adf_accel_dev *accel_dev) 67{ 68 pcie_flr(accel_to_pci_dev(accel_dev)); 69} 70EXPORT_SYMBOL_GPL(adf_reset_flr); 71 72void adf_dev_restore(struct adf_accel_dev *accel_dev) 73{ 74 struct adf_hw_device_data *hw_device = accel_dev->hw_device; 75 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 76 77 if (hw_device->reset_device) { 78 dev_info(&GET_DEV(accel_dev), "Resetting device qat_dev%d\n", 79 accel_dev->accel_id); 80 hw_device->reset_device(accel_dev); 81 pci_restore_state(pdev); 82 pci_save_state(pdev); 83 } 84} 85 86static void adf_device_reset_worker(struct work_struct *work) 87{ 88 struct adf_reset_dev_data *reset_data = 89 container_of(work, struct adf_reset_dev_data, reset_work); 90 struct adf_accel_dev *accel_dev = reset_data->accel_dev; 91 92 adf_dev_restarting_notify(accel_dev); 93 adf_dev_stop(accel_dev); 94 adf_dev_shutdown(accel_dev); 95 if (adf_dev_init(accel_dev) || adf_dev_start(accel_dev)) { 96 /* The device hanged and we can't restart it so stop here */ 97 dev_err(&GET_DEV(accel_dev), "Restart device failed\n"); 98 kfree(reset_data); 99 WARN(1, "QAT: device restart failed. Device is unusable\n"); 100 return; 101 } 102 adf_dev_restarted_notify(accel_dev); 103 clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 104 105 /* The dev is back alive. Notify the caller if in sync mode */ 106 if (reset_data->mode == ADF_DEV_RESET_SYNC) 107 complete(&reset_data->compl); 108 else 109 kfree(reset_data); 110} 111 112static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev, 113 enum adf_dev_reset_mode mode) 114{ 115 struct adf_reset_dev_data *reset_data; 116 117 if (!adf_dev_started(accel_dev) || 118 test_bit(ADF_STATUS_RESTARTING, &accel_dev->status)) 119 return 0; 120 121 set_bit(ADF_STATUS_RESTARTING, &accel_dev->status); 122 reset_data = kzalloc(sizeof(*reset_data), GFP_KERNEL); 123 if (!reset_data) 124 return -ENOMEM; 125 reset_data->accel_dev = accel_dev; 126 init_completion(&reset_data->compl); 127 reset_data->mode = mode; 128 INIT_WORK(&reset_data->reset_work, adf_device_reset_worker); 129 queue_work(device_reset_wq, &reset_data->reset_work); 130 131 /* If in sync mode wait for the result */ 132 if (mode == ADF_DEV_RESET_SYNC) { 133 int ret = 0; 134 /* Maximum device reset time is 10 seconds */ 135 unsigned long wait_jiffies = msecs_to_jiffies(10000); 136 unsigned long timeout = wait_for_completion_timeout( 137 &reset_data->compl, wait_jiffies); 138 if (!timeout) { 139 dev_err(&GET_DEV(accel_dev), 140 "Reset device timeout expired\n"); 141 ret = -EFAULT; 142 } 143 kfree(reset_data); 144 return ret; 145 } 146 return 0; 147} 148 149static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev) 150{ 151 struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev); 152 153 if (!accel_dev) { 154 pr_err("QAT: Can't find acceleration device\n"); 155 return PCI_ERS_RESULT_DISCONNECT; 156 } 157 if (adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_SYNC)) 158 return PCI_ERS_RESULT_DISCONNECT; 159 160 return PCI_ERS_RESULT_RECOVERED; 161} 162 163static void adf_resume(struct pci_dev *pdev) 164{ 165 dev_info(&pdev->dev, "Acceleration driver reset completed\n"); 166 dev_info(&pdev->dev, "Device is up and running\n"); 167} 168 169const struct pci_error_handlers adf_err_handler = { 170 .error_detected = adf_error_detected, 171 .slot_reset = adf_slot_reset, 172 .resume = adf_resume, 173}; 174EXPORT_SYMBOL_GPL(adf_err_handler); 175 176/** 177 * adf_enable_aer() - Enable Advance Error Reporting for acceleration device 178 * @accel_dev: Pointer to acceleration device. 179 * 180 * Function enables PCI Advance Error Reporting for the 181 * QAT acceleration device accel_dev. 182 * To be used by QAT device specific drivers. 183 */ 184void adf_enable_aer(struct adf_accel_dev *accel_dev) 185{ 186 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 187 188 pci_enable_pcie_error_reporting(pdev); 189} 190EXPORT_SYMBOL_GPL(adf_enable_aer); 191 192/** 193 * adf_disable_aer() - Disable Advance Error Reporting for acceleration device 194 * @accel_dev: Pointer to acceleration device. 195 * 196 * Function disables PCI Advance Error Reporting for the 197 * QAT acceleration device accel_dev. 198 * To be used by QAT device specific drivers. 199 * 200 * Return: void 201 */ 202void adf_disable_aer(struct adf_accel_dev *accel_dev) 203{ 204 struct pci_dev *pdev = accel_to_pci_dev(accel_dev); 205 206 pci_disable_pcie_error_reporting(pdev); 207} 208EXPORT_SYMBOL_GPL(adf_disable_aer); 209 210int adf_init_aer(void) 211{ 212 device_reset_wq = alloc_workqueue("qat_device_reset_wq", 213 WQ_MEM_RECLAIM, 0); 214 return !device_reset_wq ? -EFAULT : 0; 215} 216 217void adf_exit_aer(void) 218{ 219 if (device_reset_wq) 220 destroy_workqueue(device_reset_wq); 221 device_reset_wq = NULL; 222}