bpf: Separate bpf_get_[stack|stackid] for perf events BPF
author    Song Liu <songliubraving@fb.com>
          Thu, 23 Jul 2020 18:06:44 +0000 (11:06 -0700)
committer Alexei Starovoitov <ast@kernel.org>
          Sun, 26 Jul 2020 03:16:34 +0000 (20:16 -0700)
Calling get_perf_callchain() on perf_events from PEBS entries may cause
unwinder errors. To fix this issue, the callchain is fetched early. Such
perf_events are marked with __PERF_SAMPLE_CALLCHAIN_EARLY.
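
For context, a minimal userspace sketch of the kind of event affected here
(illustrative only: the attribute values and the choice of cycles as the
sampled event are assumptions, and __PERF_SAMPLE_CALLCHAIN_EARLY itself is
set by the kernel, not from userspace):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr = {
                    .size          = sizeof(attr),
                    .type          = PERF_TYPE_HARDWARE,
                    .config        = PERF_COUNT_HW_CPU_CYCLES,
                    .sample_type   = PERF_SAMPLE_CALLCHAIN,
                    .precise_ip    = 2,          /* request PEBS sampling */
                    .sample_period = 100000,
            };

            /* pid 0 = calling thread, cpu -1 = any CPU */
            int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

            return fd < 0;
    }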

Similarly, calling bpf_get_[stack|stackid] on perf_events from PEBS may
also cause unwinder errors. To fix this, add separate versions of these
two helpers, bpf_get_[stack|stackid]_pe. These two helpers use the
callchain in bpf_perf_event_data_kern->data->callchain.
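
As a usage sketch (hypothetical program, not part of this patch; map sizing
and names are assumptions following common libbpf conventions), note that
BPF program source does not change: the program keeps calling
bpf_get_stackid(), and pe_prog_func_proto() now resolves that call to
bpf_get_stackid_pe for perf_event programs:

    #include <linux/bpf.h>
    #include <linux/bpf_perf_event.h>   /* struct bpf_perf_event_data */
    #include <linux/perf_event.h>       /* PERF_MAX_STACK_DEPTH */
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_STACK_TRACE);
            __uint(max_entries, 1024);
            __uint(key_size, sizeof(__u32));
            __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(__u64));
    } stackmap SEC(".maps");

    SEC("perf_event")
    int on_sample(struct bpf_perf_event_data *ctx)
    {
            /* For events marked __PERF_SAMPLE_CALLCHAIN_EARLY this is
             * served from the callchain fetched at sample time rather
             * than a fresh (and possibly broken) unwind. */
            long id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);

            return id < 0;  /* negative id means e.g. -EFAULT, no callchain */
    }

    char LICENSE[] SEC("license") = "GPL";  /* bpf_get_stackid is GPL-only */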

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723180648.1429892-2-songliubraving@fb.com
include/linux/bpf.h
kernel/bpf/stackmap.c
kernel/trace/bpf_trace.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4175cf1f46657002c5aa923af5bbfddcc492056e..8357be3491333c78ff3007db6c4182fab258b351 100644
@@ -1675,6 +1675,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
+extern const struct bpf_func_proto bpf_get_stack_proto_pe;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 48d8e739975fa113f9f031719c36ea4f84c36843..5beb2f8c23da1c81596a6403f7151dfd962d26d0 100644
@@ -4,6 +4,7 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
+#include <linux/kernel.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
 #include <linux/elf.h>
@@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 #endif
 }
 
-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
-          u64, flags)
+static long __bpf_get_stackid(struct bpf_map *map,
+                             struct perf_callchain_entry *trace, u64 flags)
 {
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-       struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
@@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
-       bool kernel = !user;
        u64 *ips;
        bool hash_matches;
 
-       if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
-                              BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
-               return -EINVAL;
-
-       trace = get_perf_callchain(regs, init_nr, kernel, user,
-                                  sysctl_perf_event_max_stack, false, false);
-
-       if (unlikely(!trace))
-               /* couldn't fetch the stack trace */
-               return -EFAULT;
-
        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
@@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
        return id;
 }
 
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+          u64, flags)
+{
+       u32 max_depth = map->value_size / stack_map_data_size(map);
+       /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+       u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+       bool user = flags & BPF_F_USER_STACK;
+       struct perf_callchain_entry *trace;
+       bool kernel = !user;
+
+       if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+                              BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+               return -EINVAL;
+
+       trace = get_perf_callchain(regs, init_nr, kernel, user,
+                                  sysctl_perf_event_max_stack, false, false);
+
+       if (unlikely(!trace))
+               /* couldn't fetch the stack trace */
+               return -EFAULT;
+
+       return __bpf_get_stackid(map, trace, flags);
+}
+
 const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
@@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
        .arg3_type      = ARG_ANYTHING,
 };
 
+static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
+{
+       __u64 nr_kernel = 0;
+
+       while (nr_kernel < trace->nr) {
+               if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
+                       break;
+               nr_kernel++;
+       }
+       return nr_kernel;
+}
+
+BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
+          struct bpf_map *, map, u64, flags)
+{
+       struct perf_event *event = ctx->event;
+       struct perf_callchain_entry *trace;
+       bool kernel, user;
+       __u64 nr_kernel;
+       int ret;
+
+       /* perf_sample_data doesn't have callchain, use bpf_get_stackid */
+       if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+               return bpf_get_stackid((unsigned long)(ctx->regs),
+                                      (unsigned long) map, flags, 0, 0);
+
+       if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+                              BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+               return -EINVAL;
+
+       user = flags & BPF_F_USER_STACK;
+       kernel = !user;
+
+       trace = ctx->data->callchain;
+       if (unlikely(!trace))
+               return -EFAULT;
+
+       nr_kernel = count_kernel_ip(trace);
+
+       if (kernel) {
+               __u64 nr = trace->nr;
+
+               trace->nr = nr_kernel;
+               ret = __bpf_get_stackid(map, trace, flags);
+
+               /* restore nr */
+               trace->nr = nr;
+       } else { /* user */
+               u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+               skip += nr_kernel;
+               if (skip > BPF_F_SKIP_FIELD_MASK)
+                       return -EFAULT;
+
+               flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+               ret = __bpf_get_stackid(map, trace, flags);
+       }
+       return ret;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto_pe = {
+       .func           = bpf_get_stackid_pe,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+};
+
 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+                           struct perf_callchain_entry *trace_in,
                            void *buf, u32 size, u64 flags)
 {
        u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
@@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
        else
                init_nr = sysctl_perf_event_max_stack - num_elem;
 
-       if (kernel && task)
+       if (trace_in)
+               trace = trace_in;
+       else if (kernel && task)
                trace = get_callchain_entry_for_task(task, init_nr);
        else
                trace = get_perf_callchain(regs, init_nr, kernel, user,
@@ -556,7 +640,7 @@ clear:
 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
           u64, flags)
 {
-       return __bpf_get_stack(regs, NULL, buf, size, flags);
+       return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
 }
 
 const struct bpf_func_proto bpf_get_stack_proto = {
@@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
 {
        struct pt_regs *regs = task_pt_regs(task);
 
-       return __bpf_get_stack(regs, task, buf, size, flags);
+       return __bpf_get_stack(regs, task, NULL, buf, size, flags);
 }
 
 BTF_ID_LIST(bpf_get_task_stack_btf_ids)
@@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
        .btf_id         = bpf_get_task_stack_btf_ids,
 };
 
+BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
+          void *, buf, u32, size, u64, flags)
+{
+       struct perf_event *event = ctx->event;
+       struct perf_callchain_entry *trace;
+       bool kernel, user;
+       int err = -EINVAL;
+       __u64 nr_kernel;
+
+       if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+               return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags);
+
+       if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+                              BPF_F_USER_BUILD_ID)))
+               goto clear;
+
+       user = flags & BPF_F_USER_STACK;
+       kernel = !user;
+
+       err = -EFAULT;
+       trace = ctx->data->callchain;
+       if (unlikely(!trace))
+               goto clear;
+
+       nr_kernel = count_kernel_ip(trace);
+
+       if (kernel) {
+               __u64 nr = trace->nr;
+
+               trace->nr = nr_kernel;
+               err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
+                                     size, flags);
+
+               /* restore nr */
+               trace->nr = nr;
+       } else { /* user */
+               u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+               skip += nr_kernel;
+               if (skip > BPF_F_SKIP_FIELD_MASK)
+                       goto clear;
+
+               flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+               err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
+                                     size, flags);
+       }
+       return err;
+
+clear:
+       memset(buf, 0, size);
+       return err;
+
+}
+
+const struct bpf_func_proto bpf_get_stack_proto_pe = {
+       .func           = bpf_get_stack_pe,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3cc0dcb60ca20c15c4df698df8354e6049bfbb39..cb91ef902cc4362d1d93d42deb0bb2516c5b8daa 100644
@@ -1411,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
        case BPF_FUNC_perf_event_output:
                return &bpf_perf_event_output_proto_tp;
        case BPF_FUNC_get_stackid:
-               return &bpf_get_stackid_proto_tp;
+               return &bpf_get_stackid_proto_pe;
        case BPF_FUNC_get_stack:
-               return &bpf_get_stack_proto_tp;
+               return &bpf_get_stack_proto_pe;
        case BPF_FUNC_perf_prog_read_value:
                return &bpf_perf_prog_read_value_proto;
        case BPF_FUNC_read_branch_records:
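
To illustrate the user-stack branch shared by both new helpers, a worked
restatement with hypothetical values (the helper name is invented for the
sketch; the skip count occupies the low byte of flags, since
BPF_F_SKIP_FIELD_MASK is 0xff):

    #include <linux/types.h>

    /* Early-fetched callchains carry the kernel entries first, so for a
     * user stack their count must be folded into the skip field. With
     * nr_kernel = 5 and a caller-requested skip of 2, the combined skip
     * becomes 7, well under the 255 limit of the 8-bit field. */
    static long fold_kernel_skip(__u64 flags, __u64 nr_kernel)
    {
            __u64 skip = (flags & BPF_F_SKIP_FIELD_MASK) + nr_kernel;

            if (skip > BPF_F_SKIP_FIELD_MASK)   /* > 255: not encodable */
                    return -EFAULT;

            /* The rewritten flags make __bpf_get_stack[id]() skip the
             * kernel entries plus the requested frames, returning user
             * addresses only. */
            return (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
    }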