cpustat_kern.c (7139B)
1// SPDX-License-Identifier: GPL-2.0 2 3#include <linux/version.h> 4#include <linux/ptrace.h> 5#include <uapi/linux/bpf.h> 6#include <bpf/bpf_helpers.h> 7 8/* 9 * The CPU number, cstate number and pstate number are based 10 * on 96boards Hikey with octa CA53 CPUs. 11 * 12 * Every CPU have three idle states for cstate: 13 * WFI, CPU_OFF, CLUSTER_OFF 14 * 15 * Every CPU have 5 operating points: 16 * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz 17 * 18 * This code is based on these assumption and other platforms 19 * need to adjust these definitions. 20 */ 21#define MAX_CPU 8 22#define MAX_PSTATE_ENTRIES 5 23#define MAX_CSTATE_ENTRIES 3 24 25static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; 26 27/* 28 * my_map structure is used to record cstate and pstate index and 29 * timestamp (Idx, Ts), when new event incoming we need to update 30 * combination for new state index and timestamp (Idx`, Ts`). 31 * 32 * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time 33 * interval for the previous state: Duration(Idx) = Ts` - Ts. 34 * 35 * Every CPU has one below array for recording state index and 36 * timestamp, and record for cstate and pstate saperately: 37 * 38 * +--------------------------+ 39 * | cstate timestamp | 40 * +--------------------------+ 41 * | cstate index | 42 * +--------------------------+ 43 * | pstate timestamp | 44 * +--------------------------+ 45 * | pstate index | 46 * +--------------------------+ 47 */ 48#define MAP_OFF_CSTATE_TIME 0 49#define MAP_OFF_CSTATE_IDX 1 50#define MAP_OFF_PSTATE_TIME 2 51#define MAP_OFF_PSTATE_IDX 3 52#define MAP_OFF_NUM 4 53 54struct { 55 __uint(type, BPF_MAP_TYPE_ARRAY); 56 __type(key, u32); 57 __type(value, u64); 58 __uint(max_entries, MAX_CPU * MAP_OFF_NUM); 59} my_map SEC(".maps"); 60 61/* cstate_duration records duration time for every idle state per CPU */ 62struct { 63 __uint(type, BPF_MAP_TYPE_ARRAY); 64 __type(key, u32); 65 __type(value, u64); 66 __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES); 67} cstate_duration SEC(".maps"); 68 69/* pstate_duration records duration time for every operating point per CPU */ 70struct { 71 __uint(type, BPF_MAP_TYPE_ARRAY); 72 __type(key, u32); 73 __type(value, u64); 74 __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES); 75} pstate_duration SEC(".maps"); 76 77/* 78 * The trace events for cpu_idle and cpu_frequency are taken from: 79 * /sys/kernel/debug/tracing/events/power/cpu_idle/format 80 * /sys/kernel/debug/tracing/events/power/cpu_frequency/format 81 * 82 * These two events have same format, so define one common structure. 83 */ 84struct cpu_args { 85 u64 pad; 86 u32 state; 87 u32 cpu_id; 88}; 89 90/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ 91static u32 find_cpu_pstate_idx(u32 frequency) 92{ 93 u32 i; 94 95 for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { 96 if (frequency == cpu_opps[i]) 97 return i; 98 } 99 100 return i; 101} 102 103SEC("tracepoint/power/cpu_idle") 104int bpf_prog1(struct cpu_args *ctx) 105{ 106 u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; 107 u32 key, cpu, pstate_idx; 108 u64 *val; 109 110 if (ctx->cpu_id > MAX_CPU) 111 return 0; 112 113 cpu = ctx->cpu_id; 114 115 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; 116 cts = bpf_map_lookup_elem(&my_map, &key); 117 if (!cts) 118 return 0; 119 120 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; 121 cstate = bpf_map_lookup_elem(&my_map, &key); 122 if (!cstate) 123 return 0; 124 125 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; 126 pts = bpf_map_lookup_elem(&my_map, &key); 127 if (!pts) 128 return 0; 129 130 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; 131 pstate = bpf_map_lookup_elem(&my_map, &key); 132 if (!pstate) 133 return 0; 134 135 prev_state = *cstate; 136 *cstate = ctx->state; 137 138 if (!*cts) { 139 *cts = bpf_ktime_get_ns(); 140 return 0; 141 } 142 143 cur_ts = bpf_ktime_get_ns(); 144 delta = cur_ts - *cts; 145 *cts = cur_ts; 146 147 /* 148 * When state doesn't equal to (u32)-1, the cpu will enter 149 * one idle state; for this case we need to record interval 150 * for the pstate. 151 * 152 * OPP2 153 * +---------------------+ 154 * OPP1 | | 155 * ---------+ | 156 * | Idle state 157 * +--------------- 158 * 159 * |<- pstate duration ->| 160 * ^ ^ 161 * pts cur_ts 162 */ 163 if (ctx->state != (u32)-1) { 164 165 /* record pstate after have first cpu_frequency event */ 166 if (!*pts) 167 return 0; 168 169 delta = cur_ts - *pts; 170 171 pstate_idx = find_cpu_pstate_idx(*pstate); 172 if (pstate_idx >= MAX_PSTATE_ENTRIES) 173 return 0; 174 175 key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; 176 val = bpf_map_lookup_elem(&pstate_duration, &key); 177 if (val) 178 __sync_fetch_and_add((long *)val, delta); 179 180 /* 181 * When state equal to (u32)-1, the cpu just exits from one 182 * specific idle state; for this case we need to record 183 * interval for the pstate. 184 * 185 * OPP2 186 * -----------+ 187 * | OPP1 188 * | +----------- 189 * | Idle state | 190 * +---------------------+ 191 * 192 * |<- cstate duration ->| 193 * ^ ^ 194 * cts cur_ts 195 */ 196 } else { 197 198 key = cpu * MAX_CSTATE_ENTRIES + prev_state; 199 val = bpf_map_lookup_elem(&cstate_duration, &key); 200 if (val) 201 __sync_fetch_and_add((long *)val, delta); 202 } 203 204 /* Update timestamp for pstate as new start time */ 205 if (*pts) 206 *pts = cur_ts; 207 208 return 0; 209} 210 211SEC("tracepoint/power/cpu_frequency") 212int bpf_prog2(struct cpu_args *ctx) 213{ 214 u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; 215 u32 key, cpu, pstate_idx; 216 u64 *val; 217 218 cpu = ctx->cpu_id; 219 220 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; 221 pts = bpf_map_lookup_elem(&my_map, &key); 222 if (!pts) 223 return 0; 224 225 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; 226 pstate = bpf_map_lookup_elem(&my_map, &key); 227 if (!pstate) 228 return 0; 229 230 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; 231 cstate = bpf_map_lookup_elem(&my_map, &key); 232 if (!cstate) 233 return 0; 234 235 prev_state = *pstate; 236 *pstate = ctx->state; 237 238 if (!*pts) { 239 *pts = bpf_ktime_get_ns(); 240 return 0; 241 } 242 243 cur_ts = bpf_ktime_get_ns(); 244 delta = cur_ts - *pts; 245 *pts = cur_ts; 246 247 /* When CPU is in idle, bail out to skip pstate statistics */ 248 if (*cstate != (u32)(-1)) 249 return 0; 250 251 /* 252 * The cpu changes to another different OPP (in below diagram 253 * change frequency from OPP3 to OPP1), need recording interval 254 * for previous frequency OPP3 and update timestamp as start 255 * time for new frequency OPP1. 256 * 257 * OPP3 258 * +---------------------+ 259 * OPP2 | | 260 * ---------+ | 261 * | OPP1 262 * +--------------- 263 * 264 * |<- pstate duration ->| 265 * ^ ^ 266 * pts cur_ts 267 */ 268 pstate_idx = find_cpu_pstate_idx(*pstate); 269 if (pstate_idx >= MAX_PSTATE_ENTRIES) 270 return 0; 271 272 key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; 273 val = bpf_map_lookup_elem(&pstate_duration, &key); 274 if (val) 275 __sync_fetch_and_add((long *)val, delta); 276 277 return 0; 278} 279 280char _license[] SEC("license") = "GPL"; 281u32 _version SEC("version") = LINUX_VERSION_CODE;