off_cpu.bpf.c (5227B)
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE   0x0001
#define TASK_UNINTERRUPTIBLE 0x0002

#define MAX_STACKS  32
#define MAX_ENTRIES 102400

struct tstamp_data {
        __u32 stack_id;
        __u32 state;
        __u64 timestamp;
};

struct offcpu_key {
        __u32 pid;
        __u32 tgid;
        __u32 stack_id;
        __u32 state;
        __u64 cgroup_id;
};

struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, MAX_STACKS * sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct tstamp_data);
} tstamp SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(struct offcpu_key));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
        long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
        long state;
} __attribute__((preserve_access_index));

int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
        /* recast pointer to capture new type for compiler */
        struct task_struct___new *t_new = (void *)t;

        if (bpf_core_field_exists(t_new->__state)) {
                return BPF_CORE_READ(t_new, __state);
        } else {
                /* recast pointer to capture old type for compiler */
                struct task_struct___old *t_old = (void *)t;

                return BPF_CORE_READ(t_old, state);
        }
}

static inline __u64 get_cgroup_id(struct task_struct *t)
{
        struct cgroup *cgrp;

        if (uses_cgroup_v1)
                cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
        else
                cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);

        return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
        /* kernel threads don't have a user stack */
        if (t->flags & PF_KTHREAD)
                return 0;

        if (state != TASK_INTERRUPTIBLE &&
            state != TASK_UNINTERRUPTIBLE)
                return 0;

        if (has_cpu) {
                __u32 cpu = bpf_get_smp_processor_id();
                __u8 *ok;

                ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
                if (!ok)
                        return 0;
        }

        if (has_task) {
                __u8 *ok;
                __u32 pid = t->pid;

                ok = bpf_map_lookup_elem(&task_filter, &pid);
                if (!ok)
                        return 0;
        }

        if (has_cgroup) {
                __u8 *ok;
                __u64 cgrp_id = get_cgroup_id(t);

                ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
                if (!ok)
                        return 0;
        }

        return 1;
}

static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
                        struct task_struct *next, int state)
{
        __u64 ts;
        __u32 stack_id;
        struct tstamp_data *pelem;

        ts = bpf_ktime_get_ns();

        if (!can_record(prev, state))
                goto next;

        stack_id = bpf_get_stackid(ctx, &stacks,
                                   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

        pelem = bpf_task_storage_get(&tstamp, prev, NULL,
                                     BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (!pelem)
                goto next;

        pelem->timestamp = ts;
        pelem->state = state;
        pelem->stack_id = stack_id;

next:
        pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

        if (pelem && pelem->timestamp) {
                struct offcpu_key key = {
                        .pid = next->pid,
                        .tgid = next->tgid,
                        .stack_id = pelem->stack_id,
                        .state = pelem->state,
                        .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
                };
                __u64 delta = ts - pelem->timestamp;
                __u64 *total;

                total = bpf_map_lookup_elem(&off_cpu, &key);
                if (total)
                        *total += delta;
                else
                        bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

                /* prevent reusing the timestamp later */
                pelem->timestamp = 0;
        }

        return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
        struct task_struct *prev, *next;
        int prev_state;

        if (!enabled)
                return 0;

        prev = (struct task_struct *)ctx[1];
        next = (struct task_struct *)ctx[2];

        if (has_prev_state)
                prev_state = (int)ctx[3];
        else
                prev_state = get_task_state(prev);

        return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
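
For context, a minimal sketch of how a userspace loader could drive this program follows. It assumes a bpftool-generated skeleton header named "off_cpu.skel.h" with an off_cpu_bpf__* prefix (both names are illustrative; perf's actual integration lives elsewhere and differs in detail). The const volatile flags end up in .rodata and must be set before load, which lets the verifier prune the branches they gate; plain globals such as 'enabled' sit in .bss and can be flipped after attach.

#include <stdbool.h>
#include <unistd.h>
#include "off_cpu.skel.h"       /* assumed name of the generated skeleton */

int main(void)
{
        struct off_cpu_bpf *skel;
        int err;

        skel = off_cpu_bpf__open();
        if (!skel)
                return 1;

        /* .rodata constants must be configured before off_cpu_bpf__load() */
        skel->rodata->has_prev_state = false;
        skel->rodata->needs_cgroup = false;
        skel->rodata->uses_cgroup_v1 = false;

        err = off_cpu_bpf__load(skel);
        if (err)
                goto out;

        err = off_cpu_bpf__attach(skel);
        if (err)
                goto out;

        /* start accounting; on_switch() returns early until this is set */
        skel->bss->enabled = 1;

        sleep(10);      /* profile for a while */

        /* off-CPU totals accumulate in the "off_cpu" hash map, keyed by
         * struct offcpu_key; read them out with the bpf_map__* APIs. */
out:
        off_cpu_bpf__destroy(skel);
        return err ? 1 : 0;
}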