bluefield_edac.c (9271B)
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Bluefield-specific EDAC driver. 4 * 5 * Copyright (c) 2019 Mellanox Technologies. 6 */ 7 8#include <linux/acpi.h> 9#include <linux/arm-smccc.h> 10#include <linux/bitfield.h> 11#include <linux/edac.h> 12#include <linux/io.h> 13#include <linux/module.h> 14#include <linux/platform_device.h> 15 16#include "edac_module.h" 17 18#define DRIVER_NAME "bluefield-edac" 19 20/* 21 * Mellanox BlueField EMI (External Memory Interface) register definitions. 22 */ 23 24#define MLXBF_ECC_CNT 0x340 25#define MLXBF_ECC_CNT__SERR_CNT GENMASK(15, 0) 26#define MLXBF_ECC_CNT__DERR_CNT GENMASK(31, 16) 27 28#define MLXBF_ECC_ERR 0x348 29#define MLXBF_ECC_ERR__SECC BIT(0) 30#define MLXBF_ECC_ERR__DECC BIT(16) 31 32#define MLXBF_ECC_LATCH_SEL 0x354 33#define MLXBF_ECC_LATCH_SEL__START BIT(24) 34 35#define MLXBF_ERR_ADDR_0 0x358 36 37#define MLXBF_ERR_ADDR_1 0x37c 38 39#define MLXBF_SYNDROM 0x35c 40#define MLXBF_SYNDROM__DERR BIT(0) 41#define MLXBF_SYNDROM__SERR BIT(1) 42#define MLXBF_SYNDROM__SYN GENMASK(25, 16) 43 44#define MLXBF_ADD_INFO 0x364 45#define MLXBF_ADD_INFO__ERR_PRANK GENMASK(9, 8) 46 47#define MLXBF_EDAC_MAX_DIMM_PER_MC 2 48#define MLXBF_EDAC_ERROR_GRAIN 8 49 50/* 51 * Request MLNX_SIP_GET_DIMM_INFO 52 * 53 * Retrieve information about DIMM on a certain slot. 54 * 55 * Call register usage: 56 * a0: MLNX_SIP_GET_DIMM_INFO 57 * a1: (Memory controller index) << 16 | (Dimm index in memory controller) 58 * a2-7: not used. 59 * 60 * Return status: 61 * a0: MLXBF_DIMM_INFO defined below describing the DIMM. 62 * a1-3: not used. 63 */ 64#define MLNX_SIP_GET_DIMM_INFO 0x82000008 65 66/* Format for the SMC response about the memory information */ 67#define MLXBF_DIMM_INFO__SIZE_GB GENMASK_ULL(15, 0) 68#define MLXBF_DIMM_INFO__IS_RDIMM BIT(16) 69#define MLXBF_DIMM_INFO__IS_LRDIMM BIT(17) 70#define MLXBF_DIMM_INFO__IS_NVDIMM BIT(18) 71#define MLXBF_DIMM_INFO__RANKS GENMASK_ULL(23, 21) 72#define MLXBF_DIMM_INFO__PACKAGE_X GENMASK_ULL(31, 24) 73 74struct bluefield_edac_priv { 75 int dimm_ranks[MLXBF_EDAC_MAX_DIMM_PER_MC]; 76 void __iomem *emi_base; 77 int dimm_per_mc; 78}; 79 80static u64 smc_call1(u64 smc_op, u64 smc_arg) 81{ 82 struct arm_smccc_res res; 83 84 arm_smccc_smc(smc_op, smc_arg, 0, 0, 0, 0, 0, 0, &res); 85 86 return res.a0; 87} 88 89/* 90 * Gather the ECC information from the External Memory Interface registers 91 * and report it to the edac handler. 92 */ 93static void bluefield_gather_report_ecc(struct mem_ctl_info *mci, 94 int error_cnt, 95 int is_single_ecc) 96{ 97 struct bluefield_edac_priv *priv = mci->pvt_info; 98 u32 dram_additional_info, err_prank, edea0, edea1; 99 u32 ecc_latch_select, dram_syndrom, serr, derr, syndrom; 100 enum hw_event_mc_err_type ecc_type; 101 u64 ecc_dimm_addr; 102 int ecc_dimm; 103 104 ecc_type = is_single_ecc ? HW_EVENT_ERR_CORRECTED : 105 HW_EVENT_ERR_UNCORRECTED; 106 107 /* 108 * Tell the External Memory Interface to populate the relevant 109 * registers with information about the last ECC error occurrence. 110 */ 111 ecc_latch_select = MLXBF_ECC_LATCH_SEL__START; 112 writel(ecc_latch_select, priv->emi_base + MLXBF_ECC_LATCH_SEL); 113 114 /* 115 * Verify that the ECC reported info in the registers is of the 116 * same type as the one asked to report. If not, just report the 117 * error without the detailed information. 118 */ 119 dram_syndrom = readl(priv->emi_base + MLXBF_SYNDROM); 120 serr = FIELD_GET(MLXBF_SYNDROM__SERR, dram_syndrom); 121 derr = FIELD_GET(MLXBF_SYNDROM__DERR, dram_syndrom); 122 syndrom = FIELD_GET(MLXBF_SYNDROM__SYN, dram_syndrom); 123 124 if ((is_single_ecc && !serr) || (!is_single_ecc && !derr)) { 125 edac_mc_handle_error(ecc_type, mci, error_cnt, 0, 0, 0, 126 0, 0, -1, mci->ctl_name, ""); 127 return; 128 } 129 130 dram_additional_info = readl(priv->emi_base + MLXBF_ADD_INFO); 131 err_prank = FIELD_GET(MLXBF_ADD_INFO__ERR_PRANK, dram_additional_info); 132 133 ecc_dimm = (err_prank >= 2 && priv->dimm_ranks[0] <= 2) ? 1 : 0; 134 135 edea0 = readl(priv->emi_base + MLXBF_ERR_ADDR_0); 136 edea1 = readl(priv->emi_base + MLXBF_ERR_ADDR_1); 137 138 ecc_dimm_addr = ((u64)edea1 << 32) | edea0; 139 140 edac_mc_handle_error(ecc_type, mci, error_cnt, 141 PFN_DOWN(ecc_dimm_addr), 142 offset_in_page(ecc_dimm_addr), 143 syndrom, ecc_dimm, 0, 0, mci->ctl_name, ""); 144} 145 146static void bluefield_edac_check(struct mem_ctl_info *mci) 147{ 148 struct bluefield_edac_priv *priv = mci->pvt_info; 149 u32 ecc_count, single_error_count, double_error_count, ecc_error = 0; 150 151 /* 152 * The memory controller might not be initialized by the firmware 153 * when there isn't memory, which may lead to bad register readings. 154 */ 155 if (mci->edac_cap == EDAC_FLAG_NONE) 156 return; 157 158 ecc_count = readl(priv->emi_base + MLXBF_ECC_CNT); 159 single_error_count = FIELD_GET(MLXBF_ECC_CNT__SERR_CNT, ecc_count); 160 double_error_count = FIELD_GET(MLXBF_ECC_CNT__DERR_CNT, ecc_count); 161 162 if (single_error_count) { 163 ecc_error |= MLXBF_ECC_ERR__SECC; 164 165 bluefield_gather_report_ecc(mci, single_error_count, 1); 166 } 167 168 if (double_error_count) { 169 ecc_error |= MLXBF_ECC_ERR__DECC; 170 171 bluefield_gather_report_ecc(mci, double_error_count, 0); 172 } 173 174 /* Write to clear reported errors. */ 175 if (ecc_count) 176 writel(ecc_error, priv->emi_base + MLXBF_ECC_ERR); 177} 178 179/* Initialize the DIMMs information for the given memory controller. */ 180static void bluefield_edac_init_dimms(struct mem_ctl_info *mci) 181{ 182 struct bluefield_edac_priv *priv = mci->pvt_info; 183 int mem_ctrl_idx = mci->mc_idx; 184 struct dimm_info *dimm; 185 u64 smc_info, smc_arg; 186 int is_empty = 1, i; 187 188 for (i = 0; i < priv->dimm_per_mc; i++) { 189 dimm = mci->dimms[i]; 190 191 smc_arg = mem_ctrl_idx << 16 | i; 192 smc_info = smc_call1(MLNX_SIP_GET_DIMM_INFO, smc_arg); 193 194 if (!FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info)) { 195 dimm->mtype = MEM_EMPTY; 196 continue; 197 } 198 199 is_empty = 0; 200 201 dimm->edac_mode = EDAC_SECDED; 202 203 if (FIELD_GET(MLXBF_DIMM_INFO__IS_NVDIMM, smc_info)) 204 dimm->mtype = MEM_NVDIMM; 205 else if (FIELD_GET(MLXBF_DIMM_INFO__IS_LRDIMM, smc_info)) 206 dimm->mtype = MEM_LRDDR4; 207 else if (FIELD_GET(MLXBF_DIMM_INFO__IS_RDIMM, smc_info)) 208 dimm->mtype = MEM_RDDR4; 209 else 210 dimm->mtype = MEM_DDR4; 211 212 dimm->nr_pages = 213 FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info) * 214 (SZ_1G / PAGE_SIZE); 215 dimm->grain = MLXBF_EDAC_ERROR_GRAIN; 216 217 /* Mem controller for BlueField only supports x4, x8 and x16 */ 218 switch (FIELD_GET(MLXBF_DIMM_INFO__PACKAGE_X, smc_info)) { 219 case 4: 220 dimm->dtype = DEV_X4; 221 break; 222 case 8: 223 dimm->dtype = DEV_X8; 224 break; 225 case 16: 226 dimm->dtype = DEV_X16; 227 break; 228 default: 229 dimm->dtype = DEV_UNKNOWN; 230 } 231 232 priv->dimm_ranks[i] = 233 FIELD_GET(MLXBF_DIMM_INFO__RANKS, smc_info); 234 } 235 236 if (is_empty) 237 mci->edac_cap = EDAC_FLAG_NONE; 238 else 239 mci->edac_cap = EDAC_FLAG_SECDED; 240} 241 242static int bluefield_edac_mc_probe(struct platform_device *pdev) 243{ 244 struct bluefield_edac_priv *priv; 245 struct device *dev = &pdev->dev; 246 struct edac_mc_layer layers[1]; 247 struct mem_ctl_info *mci; 248 struct resource *emi_res; 249 unsigned int mc_idx, dimm_count; 250 int rc, ret; 251 252 /* Read the MSS (Memory SubSystem) index from ACPI table. */ 253 if (device_property_read_u32(dev, "mss_number", &mc_idx)) { 254 dev_warn(dev, "bf_edac: MSS number unknown\n"); 255 return -EINVAL; 256 } 257 258 /* Read the DIMMs per MC from ACPI table. */ 259 if (device_property_read_u32(dev, "dimm_per_mc", &dimm_count)) { 260 dev_warn(dev, "bf_edac: DIMMs per MC unknown\n"); 261 return -EINVAL; 262 } 263 264 if (dimm_count > MLXBF_EDAC_MAX_DIMM_PER_MC) { 265 dev_warn(dev, "bf_edac: DIMMs per MC not valid\n"); 266 return -EINVAL; 267 } 268 269 emi_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 270 if (!emi_res) 271 return -EINVAL; 272 273 layers[0].type = EDAC_MC_LAYER_SLOT; 274 layers[0].size = dimm_count; 275 layers[0].is_virt_csrow = true; 276 277 mci = edac_mc_alloc(mc_idx, ARRAY_SIZE(layers), layers, sizeof(*priv)); 278 if (!mci) 279 return -ENOMEM; 280 281 priv = mci->pvt_info; 282 283 priv->dimm_per_mc = dimm_count; 284 priv->emi_base = devm_ioremap_resource(dev, emi_res); 285 if (IS_ERR(priv->emi_base)) { 286 dev_err(dev, "failed to map EMI IO resource\n"); 287 ret = PTR_ERR(priv->emi_base); 288 goto err; 289 } 290 291 mci->pdev = dev; 292 mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_RDDR4 | 293 MEM_FLAG_LRDDR4 | MEM_FLAG_NVDIMM; 294 mci->edac_ctl_cap = EDAC_FLAG_SECDED; 295 296 mci->mod_name = DRIVER_NAME; 297 mci->ctl_name = "BlueField_Memory_Controller"; 298 mci->dev_name = dev_name(dev); 299 mci->edac_check = bluefield_edac_check; 300 301 /* Initialize mci with the actual populated DIMM information. */ 302 bluefield_edac_init_dimms(mci); 303 304 platform_set_drvdata(pdev, mci); 305 306 /* Register with EDAC core */ 307 rc = edac_mc_add_mc(mci); 308 if (rc) { 309 dev_err(dev, "failed to register with EDAC core\n"); 310 ret = rc; 311 goto err; 312 } 313 314 /* Only POLL mode supported so far. */ 315 edac_op_state = EDAC_OPSTATE_POLL; 316 317 return 0; 318 319err: 320 edac_mc_free(mci); 321 322 return ret; 323 324} 325 326static int bluefield_edac_mc_remove(struct platform_device *pdev) 327{ 328 struct mem_ctl_info *mci = platform_get_drvdata(pdev); 329 330 edac_mc_del_mc(&pdev->dev); 331 edac_mc_free(mci); 332 333 return 0; 334} 335 336static const struct acpi_device_id bluefield_mc_acpi_ids[] = { 337 {"MLNXBF08", 0}, 338 {} 339}; 340 341MODULE_DEVICE_TABLE(acpi, bluefield_mc_acpi_ids); 342 343static struct platform_driver bluefield_edac_mc_driver = { 344 .driver = { 345 .name = DRIVER_NAME, 346 .acpi_match_table = bluefield_mc_acpi_ids, 347 }, 348 .probe = bluefield_edac_mc_probe, 349 .remove = bluefield_edac_mc_remove, 350}; 351 352module_platform_driver(bluefield_edac_mc_driver); 353 354MODULE_DESCRIPTION("Mellanox BlueField memory edac driver"); 355MODULE_AUTHOR("Mellanox Technologies"); 356MODULE_LICENSE("GPL v2");