bperf_cgroup.bpf.c (4592B)
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global data will be modified before loading
//       from userspace (the perf tool) using the skeleton helpers.

// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to event index
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

int enabled = 0;
int use_cgroup_v2 = 0;

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_event_cgrp_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static int bperf_cgroup_count(void)
{
	register __u32 idx = 0; // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		*prev_val = val;
	}
	return 0;
}

// This will be attached to cgroup-switches event for each cpu
SEC("perf_events")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
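
The NOTE near the top says the maps and global data are adjusted from userspace before the object is loaded. Below is a minimal sketch of what that configuration could look like, assuming a skeleton generated by "bpftool gen skeleton" from this object (header name bperf_cgroup.skel.h, prefix bperf_cgroup_bpf__*); the function name setup_bperf_cgroup and its parameters are hypothetical, and the real setup in the perf tool also fills cgrp_idx, installs the perf event FDs into the events map, and attaches the programs, which is only noted in comments here.

// Hypothetical userspace sketch; not the perf tool's actual code.
#include <bpf/libbpf.h>
#include "bperf_cgroup.skel.h"   // generated by bpftool gen skeleton

static int setup_bperf_cgroup(int nr_events, int nr_cpus, int nr_cgroups)
{
	struct bperf_cgroup_bpf *skel;

	skel = bperf_cgroup_bpf__open();
	if (!skel)
		return -1;

	// size the maps to match the indexing used in the BPF code:
	//   events:        idx * num_cpus + cpu
	//   prev_readings: idx
	//   cgrp_readings: cgrp * num_events + idx
	bpf_map__set_max_entries(skel->maps.events, nr_events * nr_cpus);
	bpf_map__set_max_entries(skel->maps.cgrp_idx, nr_cgroups);
	bpf_map__set_max_entries(skel->maps.prev_readings, nr_events);
	bpf_map__set_max_entries(skel->maps.cgrp_readings,
				 nr_events * nr_cgroups);

	// const volatile globals must be set before load
	skel->rodata->num_events = nr_events;
	skel->rodata->num_cpus = nr_cpus;

	if (bperf_cgroup_bpf__load(skel)) {
		bperf_cgroup_bpf__destroy(skel);
		return -1;
	}

	// after load: populate cgrp_idx with cgroup-id -> index entries,
	// put the opened perf event FDs into the events map, attach the
	// programs, then start counting by flipping the global flag
	skel->bss->enabled = 1;
	return 0;
}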