perf lock: Retrieve owner callstack in bpf program
author		Chun-Tse Shao <ctshao@google.com>
		Thu, 27 Feb 2025 00:28:54 +0000 (16:28 -0800)
committer	Namhyung Kim <namhyung@kernel.org>
		Fri, 28 Feb 2025 08:29:37 +0000 (00:29 -0800)
This implements per-callstack aggregation of lock owners in addition to
the existing per-thread aggregation.  The owner callstack is captured
with `bpf_get_task_stack()` at `contention_begin()`, and a custom
stackid function is added so that owner stacks can be compared easily.
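
The maps used below (owner_stacks, owner_data, owner_stat and
stack_buf) were introduced earlier in this series.  A minimal sketch of
their shape, assuming a hypothetical MAX_STACKS bound; the real key and
value sizes and max_entries are set from user space based on the
max_stack option:

    struct {
            __uint(type, BPF_MAP_TYPE_HASH);
            __uint(key_size, sizeof(__u64) * MAX_STACKS);   /* owner stacktrace */
            __uint(value_size, sizeof(__s32));              /* unique stack id */
            __uint(max_entries, 16384);                     /* illustrative */
    } owner_stacks SEC(".maps");

    struct {
            __uint(type, BPF_MAP_TYPE_HASH);
            __uint(key_size, sizeof(__u64));                /* lock address */
            __uint(value_size, sizeof(struct owner_tracing_data));
            __uint(max_entries, 16384);                     /* illustrative */
    } owner_data SEC(".maps");

    struct {
            __uint(type, BPF_MAP_TYPE_HASH);
            __uint(key_size, sizeof(struct contention_key));
            __uint(value_size, sizeof(struct contention_data));
            __uint(max_entries, 16384);                     /* illustrative */
    } owner_stat SEC(".maps");

    /* per-CPU scratch buffer for bpf_get_task_stack() */
    struct {
            __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
            __uint(key_size, sizeof(__u32));
            __uint(value_size, sizeof(__u64) * MAX_STACKS); /* illustrative */
            __uint(max_entries, 1);
    } stack_buf SEC(".maps");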

The owner info is kept in a hash map keyed by the lock address, so that
multiple waiters on the same lock can be handled.  At `contention_end()`,
the owner lock stat is updated based on the info saved at
`contention_begin()`.  If other waiters remain, the owner pid is updated
to the current task, since reaching `contention_end()` means it now owns
the lock.  The return value of the lock function also needs to be
checked, in case the task was killed by a signal or otherwise aborted
the wait without acquiring the lock.
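
For reference, the per-lock record kept in owner_data has roughly this
shape (a sketch; the field layout is inferred from how the code below
uses it):

    struct owner_tracing_data {
            u32 pid;       /* pid of the current lock owner */
            u32 count;     /* number of waiters blocked on this lock */
            u64 timestamp; /* when the current owner stack id was recorded */
            s32 stack_id;  /* owner callstack id, or -1 if invalidated */
    };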

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
Tested-by: Athira Rajeev <atrajeev@linux.ibm.com>
Link: https://lore.kernel.org/r/20250227003359.732948-3-ctshao@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
tools/perf/util/bpf_skel/lock_contention.bpf.c

index 23fe9cc980aec3f82d9dc56ce2b750d733bf89bf..69be7a4234e076e86f0891a4958ee2015b134571 100644 (file)
@@ -197,6 +197,9 @@ int data_fail;
 int task_map_full;
 int data_map_full;
 
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
+void bpf_task_release(struct task_struct *p) __ksym __weak;
+
 static inline __u64 get_current_cgroup_id(void)
 {
        struct task_struct *task;
@@ -420,6 +423,61 @@ static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
        return pelem;
 }
 
+static inline s32 get_owner_stack_id(u64 *stacktrace)
+{
+       s32 *id, new_id;
+       static s64 id_gen = 1;
+
+       id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
+       if (id)
+               return *id;
+
+       new_id = (s32)__sync_fetch_and_add(&id_gen, 1);
+
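+       /*
+        * Multiple CPUs may race to insert an id for the same stacktrace;
+        * with BPF_NOEXIST only one update wins, and the lookup below
+        * returns the winning id.
+        */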
+       bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);
+
+       id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
+       if (id)
+               return *id;
+
+       return -1;
+}
+
+static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
+{
+       __sync_fetch_and_add(&data->total_time, duration);
+       __sync_fetch_and_add(&data->count, count);
+
+       /* FIXME: need atomic operations */
+       if (data->max_time < duration)
+               data->max_time = duration;
+       if (data->min_time > duration)
+               data->min_time = duration;
+}
+
+static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
+{
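+       /* Owner stats are aggregated purely by stack id; pid and lock addr are unused. */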
+       struct contention_key key = {
+               .stack_id = id,
+               .pid = 0,
+               .lock_addr_or_cgroup = 0,
+       };
+       struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);
+
+       if (!data) {
+               struct contention_data first = {
+                       .total_time = duration,
+                       .max_time = duration,
+                       .min_time = duration,
+                       .count = 1,
+                       .flags = flags,
+               };
+               bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
+       } else {
+               update_contention_data(data, duration, 1);
+       }
+}
+
 SEC("tp_btf/contention_begin")
 int contention_begin(u64 *ctx)
 {
@@ -437,6 +495,72 @@ int contention_begin(u64 *ctx)
        pelem->flags = (__u32)ctx[1];
 
        if (needs_callstack) {
+               u32 i = 0;
+               u32 id = 0;
+               int owner_pid;
+               u64 *buf;
+               struct task_struct *task;
+               struct owner_tracing_data *otdata;
+
+               if (!lock_owner)
+                       goto skip_owner;
+
+               task = get_lock_owner(pelem->lock, pelem->flags);
+               if (!task)
+                       goto skip_owner;
+
+               owner_pid = BPF_CORE_READ(task, pid);
+
+               buf = bpf_map_lookup_elem(&stack_buf, &i);
+               if (!buf)
+                       goto skip_owner;
+               for (i = 0; i < max_stack; i++)
+                       buf[i] = 0x0;
+
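+               /* bpf_task_from_pid() is a __weak ksym; skip if the kernel lacks it. */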
+               if (!bpf_task_from_pid)
+                       goto skip_owner;
+
+               task = bpf_task_from_pid(owner_pid);
+               if (!task)
+                       goto skip_owner;
+
+               bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
+               bpf_task_release(task);
+
+               otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
+               id = get_owner_stack_id(buf);
+
+               /*
+                * Contention has just begun, or, in a corner case, `lock` is owned by a
+                * process other than `owner_pid`.  We treat the corner case as an
+                * unexpected internal error and simply ignore the previous tracing
+                * record.
+                */
+               if (!otdata || otdata->pid != owner_pid) {
+                       struct owner_tracing_data first = {
+                               .pid = owner_pid,
+                               .timestamp = pelem->timestamp,
+                               .count = 1,
+                               .stack_id = id,
+                       };
+                       bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
+               }
+               /* Contention is ongoing and a new waiter joins. */
+               else {
+                       __sync_fetch_and_add(&otdata->count, 1);
+
+                       /*
+                        * The owner is the same, but its stacktrace might have changed.  In
+                        * that case, store/update `owner_stat` based on the current owner
+                        * stack id.
+                        */
+                       if (id != otdata->stack_id) {
+                               update_owner_stat(id, pelem->timestamp - otdata->timestamp,
+                                                 pelem->flags);
+
+                               otdata->timestamp = pelem->timestamp;
+                               otdata->stack_id = id;
+                       }
+               }
+skip_owner:
                pelem->stack_id = bpf_get_stackid(ctx, &stacks,
                                                  BPF_F_FAST_STACK_CMP | stack_skip);
                if (pelem->stack_id < 0)
@@ -473,6 +597,7 @@ int contention_end(u64 *ctx)
        struct tstamp_data *pelem;
        struct contention_key key = {};
        struct contention_data *data;
+       __u64 timestamp;
        __u64 duration;
        bool need_delete = false;
 
@@ -500,12 +625,88 @@ int contention_end(u64 *ctx)
                need_delete = true;
        }
 
-       duration = bpf_ktime_get_ns() - pelem->timestamp;
+       timestamp = bpf_ktime_get_ns();
+       duration = timestamp - pelem->timestamp;
        if ((__s64)duration < 0) {
                __sync_fetch_and_add(&time_fail, 1);
                goto out;
        }
 
+       if (needs_callstack && lock_owner) {
+               struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
+
+               if (!otdata)
+                       goto skip_owner;
+
+               /* Update `owner_stat` with the time charged to the current owner stack. */
+               update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);
+
+               /* No contention remains, so delete the `lock` entry in `owner_data`. */
+               if (otdata->count <= 1)
+                       bpf_map_delete_elem(&owner_data, &pelem->lock);
+               /*
+                * Contention is still ongoing.  The current task normally becomes the
+                * new owner, so `owner_data` should be updated accordingly (see the
+                * `ret < 0` case below for aborted waits).
+                */
+               else {
+                       u32 i = 0;
+                       s32 ret = (s32)ctx[1];
+                       u64 *buf;
+
+                       otdata->timestamp = timestamp;
+                       __sync_fetch_and_add(&otdata->count, -1);
+
+                       buf = bpf_map_lookup_elem(&stack_buf, &i);
+                       if (!buf)
+                               goto skip_owner;
+                       for (i = 0; i < (u32)max_stack; i++)
+                               buf[i] = 0x0;
+
+                       /*
+                        * `ret` holds the return code of the lock function.  If `ret` is
+                        * negative, the current task gave up waiting without acquiring the
+                        * lock.  The owner is unchanged, but we still need to refresh the
+                        * recorded owner stack.
+                        */
+                       if (ret < 0) {
+                               s32 id = 0;
+                               struct task_struct *task;
+
+                               if (!bpf_task_from_pid)
+                                       goto skip_owner;
+
+                               task = bpf_task_from_pid(otdata->pid);
+                               if (!task)
+                                       goto skip_owner;
+
+                               bpf_get_task_stack(task, buf,
+                                                  max_stack * sizeof(unsigned long), 0);
+                               bpf_task_release(task);
+
+                               id = get_owner_stack_id(buf);
+
+                               /*
+                                * If the owner stack has changed, update the owner stack id
+                                * for this lock.
+                                */
+                               if (id != otdata->stack_id)
+                                       otdata->stack_id = id;
+                       }
+                       /*
+                        * Otherwise, update tracing data with the current task, which is the new
+                        * owner.
+                        */
+                       else {
+                               otdata->pid = pid;
+                               /*
+                                * We don't retrieve the callstack here, since this is where
+                                * the current task acquires the lock and it provides no
+                                * additional information.  Simply assign -1 to invalidate it.
+                                */
+                               otdata->stack_id = -1;
+                       }
+               }
+       }
+skip_owner:
        switch (aggr_mode) {
        case LOCK_AGGR_CALLER:
                key.stack_id = pelem->stack_id;
@@ -589,14 +790,7 @@ int contention_end(u64 *ctx)
        }
 
 found:
-       __sync_fetch_and_add(&data->total_time, duration);
-       __sync_fetch_and_add(&data->count, 1);
-
-       /* FIXME: need atomic operations */
-       if (data->max_time < duration)
-               data->max_time = duration;
-       if (data->min_time > duration)
-               data->min_time = duration;
+       update_contention_data(data, duration, 1);
 
 out:
        pelem->lock = 0;