summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2018-03-15 01:14:01 +0100
committerDaniel Borkmann <daniel@iogearbox.net>2018-03-15 01:14:16 +0100
commit68de5ef4c786f63bf19b146fa009ee9208318d0b (patch)
treebebe74af6f86ad63719e1d50442482a51a6beedf /kernel
parent6d8cb045cde681e64a5ed80a2ab70be831a7f9b0 (diff)
parent81f77fd0deeb866d65ccc79733df8d2af5217c82 (diff)
downloadcachepc-linux-68de5ef4c786f63bf19b146fa009ee9208318d0b.tar.gz
cachepc-linux-68de5ef4c786f63bf19b146fa009ee9208318d0b.zip
Merge branch 'bpf-stackmap-build-id'
Song Liu says: ==================== This work follows up discussion at Plumbers'17 on improving addr->sym resolution of user stack traces. The following links have more information of the discussion: http://www.linuxplumbersconf.org/2017/ocw/proposals/4764 https://lwn.net/Articles/734453/ Section "Stack traces and kprobes" Currently, bpf stackmap store address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain the mapping from these virtual address to symbols in the binary. Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling, and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system wide profiling that is always on. This patch tries to address this with a variation to stackmap. Instead of storing addresses, the variation stores ELF file build_id + offset. After profiling, a user space tool will look up these functions with build_id (to find the binary or shared library) and the offset. I also updated bcc/cc library for the stackmap (no python/lua support yet). You can find the work at: https://github.com/liu-song-6/bcc/commits/bpf_get_stackid_v02 Changes v5 -> v6: 1. When kernel stack is added to stackmap with build_id, use fallback mechanism to store ip (status == BPF_STACK_BUILD_ID_IP). Changes v4 -> v5: 1. Only allow build_id lookup in non-nmi context. Added comment and commit message to highlight this limitation. 2. Minor fix reported by kbuild test robot. Changes v3 -> v4: 1. Add fallback when build_id lookup failed. In this case, status is set to BPF_STACK_BUILD_ID_IP, and ip of this entry is saved. 2. Handle cases where vma is only part of the file (vma->vm_pgoff != 0). Thanks to Teng for helping me identify this issue! 3. Address feedbacks for previous versions. ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/stackmap.c257
1 files changed, 235 insertions, 22 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43f5894..57eeb1234b67 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
#include "percpu_freelist.h"
-#define STACK_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_STACK_BUILD_ID)
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
u32 nr;
- u64 ip[];
+ u64 data[];
};
struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+ return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+ return stack_map_use_build_id(map) ?
+ sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- value_size < 8 || value_size % 8 ||
- value_size / 8 > sysctl_perf_event_max_stack)
+ value_size < 8 || value_size % 8)
+ return ERR_PTR(-EINVAL);
+
+ BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+ if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+ if (value_size % sizeof(struct bpf_stack_build_id) ||
+ value_size / sizeof(struct bpf_stack_build_id)
+ > sysctl_perf_event_max_stack)
+ return ERR_PTR(-EINVAL);
+ } else if (value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
return ERR_PTR(err);
}
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+ unsigned char *build_id,
+ void *note_start,
+ Elf32_Word note_size)
+{
+ Elf32_Word note_offs = 0, new_offs;
+
+ /* check for overflow */
+ if (note_start < page_addr || note_start + note_size < note_start)
+ return -EINVAL;
+
+ /* only supports note that fits in the first page */
+ if (note_start + note_size > page_addr + PAGE_SIZE)
+ return -EINVAL;
+
+ while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+ Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+ if (nhdr->n_type == BPF_BUILD_ID &&
+ nhdr->n_namesz == sizeof("GNU") &&
+ nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+ memcpy(build_id,
+ note_start + note_offs +
+ ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+ BPF_BUILD_ID_SIZE);
+ return 0;
+ }
+ new_offs = note_offs + sizeof(Elf32_Nhdr) +
+ ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+ if (new_offs <= note_offs) /* overflow */
+ break;
+ note_offs = new_offs;
+ }
+ return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+ unsigned char *build_id)
+{
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+ Elf32_Phdr *phdr;
+ int i;
+
+ /* only supports phdr that fits in one page */
+ if (ehdr->e_phnum >
+ (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+ return -EINVAL;
+
+ phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+ for (i = 0; i < ehdr->e_phnum; ++i)
+ if (phdr[i].p_type == PT_NOTE)
+ return stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz);
+ return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+ unsigned char *build_id)
+{
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+ Elf64_Phdr *phdr;
+ int i;
+
+ /* only supports phdr that fits in one page */
+ if (ehdr->e_phnum >
+ (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+ return -EINVAL;
+
+ phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+ for (i = 0; i < ehdr->e_phnum; ++i)
+ if (phdr[i].p_type == PT_NOTE)
+ return stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz);
+ return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+ unsigned char *build_id)
+{
+ Elf32_Ehdr *ehdr;
+ struct page *page;
+ void *page_addr;
+ int ret;
+
+ /* only works for page backed storage */
+ if (!vma->vm_file)
+ return -EINVAL;
+
+ page = find_get_page(vma->vm_file->f_mapping, 0);
+ if (!page)
+ return -EFAULT; /* page not mapped */
+
+ ret = -EINVAL;
+ page_addr = page_address(page);
+ ehdr = (Elf32_Ehdr *)page_addr;
+
+ /* compare magic x7f "ELF" */
+ if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+ goto out;
+
+ /* only support executable file and shared object file */
+ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+ goto out;
+
+ if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+ ret = stack_map_get_build_id_32(page_addr, build_id);
+ else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+ ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+ put_page(page);
+ return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+ struct stack_map_bucket *bucket,
+ u64 *ips, u32 trace_nr, bool user)
+{
+ int i;
+ struct vm_area_struct *vma;
+ struct bpf_stack_build_id *id_offs;
+
+ bucket->nr = trace_nr;
+ id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+ /*
+ * We cannot do up_read() in nmi context, so build_id lookup is
+ * only supported for non-nmi events. If at some point, it is
+ * possible to run find_vma() without taking the semaphore, we
+ * would like to allow build_id lookup in nmi context.
+ *
+ * Same fallback is used for kernel stack (!user) on a stackmap
+ * with build_id.
+ */
+ if (!user || !current || !current->mm || in_nmi() ||
+ down_read_trylock(&current->mm->mmap_sem) == 0) {
+ /* cannot access current->mm, fall back to ips */
+ for (i = 0; i < trace_nr; i++) {
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ id_offs[i].ip = ips[i];
+ }
+ return;
+ }
+
+ for (i = 0; i < trace_nr; i++) {
+ vma = find_vma(current->mm, ips[i]);
+ if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+ /* per entry fall back to ips */
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ id_offs[i].ip = ips[i];
+ continue;
+ }
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+ - vma->vm_start;
+ id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ }
+ up_read(&current->mm->mmap_sem);
+}
+
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / 8;
+ u32 max_depth = map->value_size / stack_map_data_size(map);
/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
bool user = flags & BPF_F_USER_STACK;
bool kernel = !user;
u64 *ips;
+ bool hash_matches;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
- if (bucket && bucket->hash == hash) {
- if (flags & BPF_F_FAST_STACK_CMP)
+ hash_matches = bucket && bucket->hash == hash;
+ /* fast cmp */
+ if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+ return id;
+
+ if (stack_map_use_build_id(map)) {
+ /* for build_id+offset, pop a bucket before slow cmp */
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ stack_map_get_build_id_offset(map, new_bucket, ips,
+ trace_nr, user);
+ trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
return id;
- if (bucket->nr == trace_nr &&
- memcmp(bucket->ip, ips, trace_len) == 0)
+ }
+ if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+ return -EEXIST;
+ }
+ } else {
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, ips, trace_len) == 0)
return id;
+ if (bucket && !(flags & BPF_F_REUSE_STACKID))
+ return -EEXIST;
+
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ memcpy(new_bucket->data, ips, trace_len);
}
- /* this call stack is not in the map, try to add it */
- if (bucket && !(flags & BPF_F_REUSE_STACKID))
- return -EEXIST;
-
- new_bucket = (struct stack_map_bucket *)
- pcpu_freelist_pop(&smap->freelist);
- if (unlikely(!new_bucket))
- return -ENOMEM;
-
- memcpy(new_bucket->ip, ips, trace_len);
new_bucket->hash = hash;
new_bucket->nr = trace_nr;
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
if (!bucket)
return -ENOENT;
- trace_len = bucket->nr * sizeof(u64);
- memcpy(value, bucket->ip, trace_len);
+ trace_len = bucket->nr * stack_map_data_size(map);
+ memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
old_bucket = xchg(&smap->buckets[id], bucket);