events.c (14762B)
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
// Copyright (c) 2018 Mellanox Technologies

#include <linux/mlx5/driver.h>

#include "mlx5_core.h"
#include "lib/eq.h"
#include "lib/mlx5.h"

/* Pairs an EQ notifier entry with the mlx5_events context it was
 * registered for; handlers recover the context via mlx5_nb_cof() and
 * then read ->ctx.
 */
struct mlx5_event_nb {
	struct mlx5_nb  nb;
	void           *ctx;
};

/* General events handlers for the low level mlx5_core driver
 *
 * Other Major feature specific events such as
 * clock/eswitch/fpga/FW trace and many others, are handled elsewhere, with
 * separate notifiers callbacks, specifically by those mlx5 components.
 */
static int any_notifier(struct notifier_block *, unsigned long, void *);
static int temp_warn(struct notifier_block *, unsigned long, void *);
static int port_module(struct notifier_block *, unsigned long, void *);
static int pcie_core(struct notifier_block *, unsigned long, void *);

/* handler which forwards the event to events->fw_nh, driver notifiers */
static int forward_event(struct notifier_block *, unsigned long, void *);

/* Template table of EQ notifiers; copied per-device into
 * mlx5_events->notifiers at mlx5_events_start() time.
 */
static struct mlx5_nb events_nbs_ref[] = {
	/* Events to be processed by mlx5_core */
	{.nb.notifier_call = any_notifier,  .event_type = MLX5_EVENT_TYPE_NOTIFY_ANY },
	{.nb.notifier_call = temp_warn,     .event_type = MLX5_EVENT_TYPE_TEMP_WARN_EVENT },
	{.nb.notifier_call = port_module,   .event_type = MLX5_EVENT_TYPE_PORT_MODULE_EVENT },
	{.nb.notifier_call = pcie_core,     .event_type = MLX5_EVENT_TYPE_GENERAL_EVENT },

	/* Events to be forwarded (as is) to mlx5 core interfaces (mlx5e/mlx5_ib) */
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PORT_CHANGE },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_GENERAL_EVENT },
	/* QP/WQ resource events to forward */
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_DCT_DRAINED },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PATH_MIG },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_COMM_EST },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SQ_DRAINED },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_LAST_WQE },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_CATAS_ERROR },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PATH_MIG_FAILED },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_ACCESS_ERROR },
	/* SRQ events */
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_CATAS_ERROR },
	{.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_RQ_LIMIT },
};
forward_event, .event_type = MLX5_EVENT_TYPE_SQ_DRAINED }, 44 {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_LAST_WQE }, 45 {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_CATAS_ERROR }, 46 {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PATH_MIG_FAILED }, 47 {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR }, 48 {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_ACCESS_ERROR }, 49 /* SRQ events */ 50 {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_CATAS_ERROR }, 51 {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_RQ_LIMIT }, 52}; 53 54struct mlx5_events { 55 struct mlx5_core_dev *dev; 56 struct workqueue_struct *wq; 57 struct mlx5_event_nb notifiers[ARRAY_SIZE(events_nbs_ref)]; 58 /* driver notifier chain for fw events */ 59 struct atomic_notifier_head fw_nh; 60 /* port module events stats */ 61 struct mlx5_pme_stats pme_stats; 62 /*pcie_core*/ 63 struct work_struct pcie_core_work; 64 /* driver notifier chain for sw events */ 65 struct blocking_notifier_head sw_nh; 66}; 67 68static const char *eqe_type_str(u8 type) 69{ 70 switch (type) { 71 case MLX5_EVENT_TYPE_COMP: 72 return "MLX5_EVENT_TYPE_COMP"; 73 case MLX5_EVENT_TYPE_PATH_MIG: 74 return "MLX5_EVENT_TYPE_PATH_MIG"; 75 case MLX5_EVENT_TYPE_COMM_EST: 76 return "MLX5_EVENT_TYPE_COMM_EST"; 77 case MLX5_EVENT_TYPE_SQ_DRAINED: 78 return "MLX5_EVENT_TYPE_SQ_DRAINED"; 79 case MLX5_EVENT_TYPE_SRQ_LAST_WQE: 80 return "MLX5_EVENT_TYPE_SRQ_LAST_WQE"; 81 case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: 82 return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT"; 83 case MLX5_EVENT_TYPE_CQ_ERROR: 84 return "MLX5_EVENT_TYPE_CQ_ERROR"; 85 case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: 86 return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR"; 87 case MLX5_EVENT_TYPE_PATH_MIG_FAILED: 88 return "MLX5_EVENT_TYPE_PATH_MIG_FAILED"; 89 case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: 90 return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR"; 91 case 
MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: 92 return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR"; 93 case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: 94 return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR"; 95 case MLX5_EVENT_TYPE_INTERNAL_ERROR: 96 return "MLX5_EVENT_TYPE_INTERNAL_ERROR"; 97 case MLX5_EVENT_TYPE_PORT_CHANGE: 98 return "MLX5_EVENT_TYPE_PORT_CHANGE"; 99 case MLX5_EVENT_TYPE_GPIO_EVENT: 100 return "MLX5_EVENT_TYPE_GPIO_EVENT"; 101 case MLX5_EVENT_TYPE_PORT_MODULE_EVENT: 102 return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT"; 103 case MLX5_EVENT_TYPE_TEMP_WARN_EVENT: 104 return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT"; 105 case MLX5_EVENT_TYPE_REMOTE_CONFIG: 106 return "MLX5_EVENT_TYPE_REMOTE_CONFIG"; 107 case MLX5_EVENT_TYPE_DB_BF_CONGESTION: 108 return "MLX5_EVENT_TYPE_DB_BF_CONGESTION"; 109 case MLX5_EVENT_TYPE_STALL_EVENT: 110 return "MLX5_EVENT_TYPE_STALL_EVENT"; 111 case MLX5_EVENT_TYPE_CMD: 112 return "MLX5_EVENT_TYPE_CMD"; 113 case MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED: 114 return "MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED"; 115 case MLX5_EVENT_TYPE_VHCA_STATE_CHANGE: 116 return "MLX5_EVENT_TYPE_VHCA_STATE_CHANGE"; 117 case MLX5_EVENT_TYPE_PAGE_REQUEST: 118 return "MLX5_EVENT_TYPE_PAGE_REQUEST"; 119 case MLX5_EVENT_TYPE_PAGE_FAULT: 120 return "MLX5_EVENT_TYPE_PAGE_FAULT"; 121 case MLX5_EVENT_TYPE_PPS_EVENT: 122 return "MLX5_EVENT_TYPE_PPS_EVENT"; 123 case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE: 124 return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE"; 125 case MLX5_EVENT_TYPE_FPGA_ERROR: 126 return "MLX5_EVENT_TYPE_FPGA_ERROR"; 127 case MLX5_EVENT_TYPE_FPGA_QP_ERROR: 128 return "MLX5_EVENT_TYPE_FPGA_QP_ERROR"; 129 case MLX5_EVENT_TYPE_GENERAL_EVENT: 130 return "MLX5_EVENT_TYPE_GENERAL_EVENT"; 131 case MLX5_EVENT_TYPE_MONITOR_COUNTER: 132 return "MLX5_EVENT_TYPE_MONITOR_COUNTER"; 133 case MLX5_EVENT_TYPE_DEVICE_TRACER: 134 return "MLX5_EVENT_TYPE_DEVICE_TRACER"; 135 default: 136 return "Unrecognized event"; 137 } 138} 139 140/* handles all FW events, type == eqe->type */ 141static int any_notifier(struct notifier_block 
*nb, 142 unsigned long type, void *data) 143{ 144 struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); 145 struct mlx5_events *events = event_nb->ctx; 146 struct mlx5_eqe *eqe = data; 147 148 mlx5_core_dbg(events->dev, "Async eqe type %s, subtype (%d)\n", 149 eqe_type_str(eqe->type), eqe->sub_type); 150 return NOTIFY_OK; 151} 152 153/* type == MLX5_EVENT_TYPE_TEMP_WARN_EVENT */ 154static int temp_warn(struct notifier_block *nb, unsigned long type, void *data) 155{ 156 struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); 157 struct mlx5_events *events = event_nb->ctx; 158 struct mlx5_eqe *eqe = data; 159 u64 value_lsb; 160 u64 value_msb; 161 162 value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb); 163 value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb); 164 165 mlx5_core_warn(events->dev, 166 "High temperature on sensors with bit set %llx %llx", 167 value_msb, value_lsb); 168 169 return NOTIFY_OK; 170} 171 172/* MLX5_EVENT_TYPE_PORT_MODULE_EVENT */ 173static const char *mlx5_pme_status_to_string(enum port_module_event_status_type status) 174{ 175 switch (status) { 176 case MLX5_MODULE_STATUS_PLUGGED: 177 return "Cable plugged"; 178 case MLX5_MODULE_STATUS_UNPLUGGED: 179 return "Cable unplugged"; 180 case MLX5_MODULE_STATUS_ERROR: 181 return "Cable error"; 182 case MLX5_MODULE_STATUS_DISABLED: 183 return "Cable disabled"; 184 default: 185 return "Unknown status"; 186 } 187} 188 189static const char *mlx5_pme_error_to_string(enum port_module_event_error_type error) 190{ 191 switch (error) { 192 case MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED: 193 return "Power budget exceeded"; 194 case MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX: 195 return "Long Range for non MLNX cable"; 196 case MLX5_MODULE_EVENT_ERROR_BUS_STUCK: 197 return "Bus stuck (I2C or data shorted)"; 198 case MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT: 199 return "No EEPROM/retry timeout"; 200 case 
MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST: 201 return "Enforce part number list"; 202 case MLX5_MODULE_EVENT_ERROR_UNKNOWN_IDENTIFIER: 203 return "Unknown identifier"; 204 case MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE: 205 return "High Temperature"; 206 case MLX5_MODULE_EVENT_ERROR_BAD_CABLE: 207 return "Bad or shorted cable/module"; 208 case MLX5_MODULE_EVENT_ERROR_PCIE_POWER_SLOT_EXCEEDED: 209 return "One or more network ports have been powered down due to insufficient/unadvertised power on the PCIe slot"; 210 default: 211 return "Unknown error"; 212 } 213} 214 215/* type == MLX5_EVENT_TYPE_PORT_MODULE_EVENT */ 216static int port_module(struct notifier_block *nb, unsigned long type, void *data) 217{ 218 struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); 219 struct mlx5_events *events = event_nb->ctx; 220 struct mlx5_eqe *eqe = data; 221 222 enum port_module_event_status_type module_status; 223 enum port_module_event_error_type error_type; 224 struct mlx5_eqe_port_module *module_event_eqe; 225 const char *status_str; 226 u8 module_num; 227 228 module_event_eqe = &eqe->data.port_module; 229 module_status = module_event_eqe->module_status & 230 PORT_MODULE_EVENT_MODULE_STATUS_MASK; 231 error_type = module_event_eqe->error_type & 232 PORT_MODULE_EVENT_ERROR_TYPE_MASK; 233 234 if (module_status < MLX5_MODULE_STATUS_NUM) 235 events->pme_stats.status_counters[module_status]++; 236 237 if (module_status == MLX5_MODULE_STATUS_ERROR) 238 if (error_type < MLX5_MODULE_EVENT_ERROR_NUM) 239 events->pme_stats.error_counters[error_type]++; 240 241 if (!printk_ratelimit()) 242 return NOTIFY_OK; 243 244 module_num = module_event_eqe->module; 245 status_str = mlx5_pme_status_to_string(module_status); 246 if (module_status == MLX5_MODULE_STATUS_ERROR) { 247 const char *error_str = mlx5_pme_error_to_string(error_type); 248 249 mlx5_core_err(events->dev, 250 "Port module event[error]: module %u, %s, %s\n", 251 module_num, status_str, error_str); 252 } 
else { 253 mlx5_core_info(events->dev, 254 "Port module event: module %u, %s\n", 255 module_num, status_str); 256 } 257 258 return NOTIFY_OK; 259} 260 261enum { 262 MLX5_PCI_POWER_COULD_NOT_BE_READ = 0x0, 263 MLX5_PCI_POWER_SUFFICIENT_REPORTED = 0x1, 264 MLX5_PCI_POWER_INSUFFICIENT_REPORTED = 0x2, 265}; 266 267static void mlx5_pcie_event(struct work_struct *work) 268{ 269 u32 out[MLX5_ST_SZ_DW(mpein_reg)] = {0}; 270 u32 in[MLX5_ST_SZ_DW(mpein_reg)] = {0}; 271 struct mlx5_events *events; 272 struct mlx5_core_dev *dev; 273 u8 power_status; 274 u16 pci_power; 275 276 events = container_of(work, struct mlx5_events, pcie_core_work); 277 dev = events->dev; 278 279 if (!MLX5_CAP_MCAM_FEATURE(dev, pci_status_and_power)) 280 return; 281 282 mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), 283 MLX5_REG_MPEIN, 0, 0); 284 power_status = MLX5_GET(mpein_reg, out, pwr_status); 285 pci_power = MLX5_GET(mpein_reg, out, pci_power); 286 287 switch (power_status) { 288 case MLX5_PCI_POWER_COULD_NOT_BE_READ: 289 mlx5_core_info_rl(dev, 290 "PCIe slot power capability was not advertised.\n"); 291 break; 292 case MLX5_PCI_POWER_INSUFFICIENT_REPORTED: 293 mlx5_core_warn_rl(dev, 294 "Detected insufficient power on the PCIe slot (%uW).\n", 295 pci_power); 296 break; 297 case MLX5_PCI_POWER_SUFFICIENT_REPORTED: 298 mlx5_core_info_rl(dev, 299 "PCIe slot advertised sufficient power (%uW).\n", 300 pci_power); 301 break; 302 } 303} 304 305static int pcie_core(struct notifier_block *nb, unsigned long type, void *data) 306{ 307 struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, 308 struct mlx5_event_nb, 309 nb); 310 struct mlx5_events *events = event_nb->ctx; 311 struct mlx5_eqe *eqe = data; 312 313 switch (eqe->sub_type) { 314 case MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT: 315 queue_work(events->wq, &events->pcie_core_work); 316 break; 317 default: 318 return NOTIFY_DONE; 319 } 320 321 return NOTIFY_OK; 322} 323 324void mlx5_get_pme_stats(struct mlx5_core_dev *dev, struct 
mlx5_pme_stats *stats) 325{ 326 *stats = dev->priv.events->pme_stats; 327} 328 329/* forward event as is to registered interfaces (mlx5e/mlx5_ib) */ 330static int forward_event(struct notifier_block *nb, unsigned long event, void *data) 331{ 332 struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); 333 struct mlx5_events *events = event_nb->ctx; 334 struct mlx5_eqe *eqe = data; 335 336 mlx5_core_dbg(events->dev, "Async eqe type %s, subtype (%d) forward to interfaces\n", 337 eqe_type_str(eqe->type), eqe->sub_type); 338 atomic_notifier_call_chain(&events->fw_nh, event, data); 339 return NOTIFY_OK; 340} 341 342int mlx5_events_init(struct mlx5_core_dev *dev) 343{ 344 struct mlx5_events *events = kzalloc(sizeof(*events), GFP_KERNEL); 345 346 if (!events) 347 return -ENOMEM; 348 349 ATOMIC_INIT_NOTIFIER_HEAD(&events->fw_nh); 350 events->dev = dev; 351 dev->priv.events = events; 352 events->wq = create_singlethread_workqueue("mlx5_events"); 353 if (!events->wq) { 354 kfree(events); 355 return -ENOMEM; 356 } 357 INIT_WORK(&events->pcie_core_work, mlx5_pcie_event); 358 BLOCKING_INIT_NOTIFIER_HEAD(&events->sw_nh); 359 360 return 0; 361} 362 363void mlx5_events_cleanup(struct mlx5_core_dev *dev) 364{ 365 destroy_workqueue(dev->priv.events->wq); 366 kvfree(dev->priv.events); 367} 368 369void mlx5_events_start(struct mlx5_core_dev *dev) 370{ 371 struct mlx5_events *events = dev->priv.events; 372 int i; 373 374 for (i = 0; i < ARRAY_SIZE(events_nbs_ref); i++) { 375 events->notifiers[i].nb = events_nbs_ref[i]; 376 events->notifiers[i].ctx = events; 377 mlx5_eq_notifier_register(dev, &events->notifiers[i].nb); 378 } 379} 380 381void mlx5_events_stop(struct mlx5_core_dev *dev) 382{ 383 struct mlx5_events *events = dev->priv.events; 384 int i; 385 386 for (i = ARRAY_SIZE(events_nbs_ref) - 1; i >= 0 ; i--) 387 mlx5_eq_notifier_unregister(dev, &events->notifiers[i].nb); 388 flush_workqueue(events->wq); 389} 390 391/* This API is used only for processing and 
/* This API is used only for processing and forwarding firmware
 * events to mlx5 consumer.
 */
int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb)
{
	struct mlx5_events *events = dev->priv.events;

	/* atomic chain: callbacks may run from EQ (atomic) context */
	return atomic_notifier_chain_register(&events->fw_nh, nb);
}
EXPORT_SYMBOL(mlx5_notifier_register);

int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb)
{
	struct mlx5_events *events = dev->priv.events;

	return atomic_notifier_chain_unregister(&events->fw_nh, nb);
}
EXPORT_SYMBOL(mlx5_notifier_unregister);

/* Invoke the fw-event chain directly (not exported; in-driver use). */
int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, void *data)
{
	return atomic_notifier_call_chain(&events->fw_nh, event, data);
}

/* This API is used only for processing and forwarding driver-specific
 * events to mlx5 consumers.
 */
int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb)
{
	struct mlx5_events *events = dev->priv.events;

	/* blocking chain: callbacks run in process context and may sleep */
	return blocking_notifier_chain_register(&events->sw_nh, nb);
}

int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb)
{
	struct mlx5_events *events = dev->priv.events;

	return blocking_notifier_chain_unregister(&events->sw_nh, nb);
}

int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event,
				      void *data)
{
	struct mlx5_events *events = dev->priv.events;

	return blocking_notifier_call_chain(&events->sw_nh, event, data);
}

/* Queue arbitrary work on the events workqueue owned by this device. */
void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work)
{
	queue_work(dev->priv.events->wq, work);
}