runtest.c (7562B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* Copyright(c) 2022 Intel Corporation. */ 3 4#include <linux/cpu.h> 5#include <linux/delay.h> 6#include <linux/fs.h> 7#include <linux/nmi.h> 8#include <linux/slab.h> 9#include <linux/stop_machine.h> 10 11#include "ifs.h" 12 13/* 14 * Note all code and data in this file is protected by 15 * ifs_sem. On HT systems all threads on a core will 16 * execute together, but only the first thread on the 17 * core will update results of the test. 18 */ 19 20#define CREATE_TRACE_POINTS 21#include <trace/events/intel_ifs.h> 22 23/* Max retries on the same chunk */ 24#define MAX_IFS_RETRIES 5 25 26/* 27 * Number of TSC cycles that a logical CPU will wait for the other 28 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). 29 */ 30#define IFS_THREAD_WAIT 100000 31 32enum ifs_status_err_code { 33 IFS_NO_ERROR = 0, 34 IFS_OTHER_THREAD_COULD_NOT_JOIN = 1, 35 IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2, 36 IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3, 37 IFS_INVALID_CHUNK_RANGE = 4, 38 IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5, 39 IFS_CORE_NOT_CAPABLE_CURRENTLY = 6, 40 IFS_UNASSIGNED_ERROR_CODE = 7, 41 IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8, 42 IFS_INTERRUPTED_DURING_EXECUTION = 9, 43}; 44 45static const char * const scan_test_status[] = { 46 [IFS_NO_ERROR] = "SCAN no error", 47 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.", 48 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.", 49 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] = 50 "Core Abort SCAN Response due to power management condition.", 51 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range", 52 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.", 53 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently", 54 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7", 55 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] = 56 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently", 57 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start", 58}; 59 60static void message_not_tested(struct device *dev, int cpu, union ifs_status status) 61{ 62 if (status.error_code < ARRAY_SIZE(scan_test_status)) { 63 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n", 64 cpumask_pr_args(cpu_smt_mask(cpu)), 65 scan_test_status[status.error_code]); 66 } else if (status.error_code == IFS_SW_TIMEOUT) { 67 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n", 68 cpumask_pr_args(cpu_smt_mask(cpu))); 69 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) { 70 dev_info(dev, "CPU(s) %*pbl: %s\n", 71 cpumask_pr_args(cpu_smt_mask(cpu)), 72 "Not all scan chunks were executed. Maximum forward progress retries exceeded"); 73 } else { 74 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n", 75 cpumask_pr_args(cpu_smt_mask(cpu)), status.data); 76 } 77} 78 79static void message_fail(struct device *dev, int cpu, union ifs_status status) 80{ 81 /* 82 * control_error is set when the microcode runs into a problem 83 * loading the image from the reserved BIOS memory, or it has 84 * been corrupted. Reloading the image may fix this issue. 85 */ 86 if (status.control_error) { 87 dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image\n", 88 cpumask_pr_args(cpu_smt_mask(cpu))); 89 } 90 91 /* 92 * signature_error is set when the output from the scan chains does not 93 * match the expected signature. This might be a transient problem (e.g. 94 * due to a bit flip from an alpha particle or neutron). If the problem 95 * repeats on a subsequent test, then it indicates an actual problem in 96 * the core being tested. 97 */ 98 if (status.signature_error) { 99 dev_err(dev, "CPU(s) %*pbl: test signature incorrect.\n", 100 cpumask_pr_args(cpu_smt_mask(cpu))); 101 } 102} 103 104static bool can_restart(union ifs_status status) 105{ 106 enum ifs_status_err_code err_code = status.error_code; 107 108 /* Signature for chunk is bad, or scan test failed */ 109 if (status.signature_error || status.control_error) 110 return false; 111 112 switch (err_code) { 113 case IFS_NO_ERROR: 114 case IFS_OTHER_THREAD_COULD_NOT_JOIN: 115 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS: 116 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN: 117 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT: 118 case IFS_INTERRUPTED_DURING_EXECUTION: 119 return true; 120 case IFS_INVALID_CHUNK_RANGE: 121 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS: 122 case IFS_CORE_NOT_CAPABLE_CURRENTLY: 123 case IFS_UNASSIGNED_ERROR_CODE: 124 break; 125 } 126 return false; 127} 128 129/* 130 * Execute the scan. Called "simultaneously" on all threads of a core 131 * at high priority using the stop_cpus mechanism. 132 */ 133static int doscan(void *data) 134{ 135 int cpu = smp_processor_id(); 136 u64 *msrs = data; 137 int first; 138 139 /* Only the first logical CPU on a core reports result */ 140 first = cpumask_first(cpu_smt_mask(cpu)); 141 142 /* 143 * This WRMSR will wait for other HT threads to also write 144 * to this MSR (at most for activate.delay cycles). Then it 145 * starts scan of each requested chunk. The core scan happens 146 * during the "execution" of the WRMSR. This instruction can 147 * take up to 200 milliseconds (in the case where all chunks 148 * are processed in a single pass) before it retires. 149 */ 150 wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]); 151 152 if (cpu == first) { 153 /* Pass back the result of the scan */ 154 rdmsrl(MSR_SCAN_STATUS, msrs[1]); 155 } 156 157 return 0; 158} 159 160/* 161 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN 162 * on all threads of the core to be tested. Loop if necessary to complete 163 * run of all chunks. Include some defensive tests to make sure forward 164 * progress is made, and that the whole test completes in a reasonable time. 165 */ 166static void ifs_test_core(int cpu, struct device *dev) 167{ 168 union ifs_scan activate; 169 union ifs_status status; 170 unsigned long timeout; 171 struct ifs_data *ifsd; 172 u64 msrvals[2]; 173 int retries; 174 175 ifsd = ifs_get_data(dev); 176 177 activate.rsvd = 0; 178 activate.delay = IFS_THREAD_WAIT; 179 activate.sigmce = 0; 180 activate.start = 0; 181 activate.stop = ifsd->valid_chunks - 1; 182 183 timeout = jiffies + HZ / 2; 184 retries = MAX_IFS_RETRIES; 185 186 while (activate.start <= activate.stop) { 187 if (time_after(jiffies, timeout)) { 188 status.error_code = IFS_SW_TIMEOUT; 189 break; 190 } 191 192 msrvals[0] = activate.data; 193 stop_core_cpuslocked(cpu, doscan, msrvals); 194 195 status.data = msrvals[1]; 196 197 trace_ifs_status(cpu, activate, status); 198 199 /* Some cases can be retried, give up for others */ 200 if (!can_restart(status)) 201 break; 202 203 if (status.chunk_num == activate.start) { 204 /* Check for forward progress */ 205 if (--retries == 0) { 206 if (status.error_code == IFS_NO_ERROR) 207 status.error_code = IFS_SW_PARTIAL_COMPLETION; 208 break; 209 } 210 } else { 211 retries = MAX_IFS_RETRIES; 212 activate.start = status.chunk_num; 213 } 214 } 215 216 /* Update status for this core */ 217 ifsd->scan_details = status.data; 218 219 if (status.control_error || status.signature_error) { 220 ifsd->status = SCAN_TEST_FAIL; 221 message_fail(dev, cpu, status); 222 } else if (status.error_code) { 223 ifsd->status = SCAN_NOT_TESTED; 224 message_not_tested(dev, cpu, status); 225 } else { 226 ifsd->status = SCAN_TEST_PASS; 227 } 228} 229 230/* 231 * Initiate per core test. It wakes up work queue threads on the target cpu and 232 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and 233 * wait for all sibling threads to finish the scan test. 234 */ 235int do_core_test(int cpu, struct device *dev) 236{ 237 int ret = 0; 238 239 /* Prevent CPUs from being taken offline during the scan test */ 240 cpus_read_lock(); 241 242 if (!cpu_online(cpu)) { 243 dev_info(dev, "cannot test on the offline cpu %d\n", cpu); 244 ret = -EINVAL; 245 goto out; 246 } 247 248 ifs_test_core(cpu, dev); 249out: 250 cpus_read_unlock(); 251 return ret; 252}