Skip to content

Commit

Permalink
libbpf-tools/klockstat: Search for correct stack offset in userspace
Browse files Browse the repository at this point in the history
The klockstat BPF code contains a hard-coded offset (of 4) for the
captured stack entries, which is supposed to get rid of the topmost
entries that are just locking functions. However, when running the
klockstat tool on RHEL, this offset is not correct, leading to output
like the following:

                               Caller  Avg Wait    Count   Max Wait   Total Wait
                       mutex_lock+0x5    1.1 us        3     1.8 us       3.2 us
                       mutex_lock+0x5    825 ns        5     1.6 us       4.1 us
                       mutex_lock+0x5    1.1 us        4     1.5 us       4.3 us

                               Caller  Avg Hold    Count   Max Hold   Total Hold
                       mutex_lock+0x5   18.0 ms        3    23.0 ms      54.0 ms
                       mutex_lock+0x5    1.7 ms       15     2.2 ms      25.1 ms
                       mutex_lock+0x5    1.7 ms       15     2.2 ms      25.1 ms

To fix this, remove the offset in the BPF code and add logic in the
userspace component to walk the stack frames and find the right offset.
We do this by keeping a list of all the possible lock function names we
attach to, pre-resolving them into kernel symbol addresses. Then, each
time we process a stack trace obtained from the kernel, we walk the
stack until we find one of those symbols, and use that (+1) as the stack
offset to process.

This makes the output work correctly regardless of what the stack
offset happens to be on a given system.

Signed-off-by: Toke Høiland-Jørgensen <[email protected]>
  • Loading branch information
tohojo committed Feb 26, 2025
1 parent 2b32635 commit 22cca3c
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 17 deletions.
12 changes: 1 addition & 11 deletions libbpf-tools/klockstat.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,7 @@ static void lock_contended(void *ctx, void *lock)

li->task_id = task_id;
li->lock_ptr = (u64)lock;
/*
* Skip 4 frames, e.g.:
* __this_module+0x34ef
* __this_module+0x34ef
* __this_module+0x8c44
* mutex_lock+0x5
*
* Note: if you make major changes to this bpf program, double check
* that you aren't skipping too many frames.
*/
li->stack_id = bpf_get_stackid(ctx, &stack_map, 4 | BPF_F_FAST_STACK_CMP);
li->stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_FAST_STACK_CMP);

/* Legit failures include EEXIST */
if (li->stack_id < 0)
Expand Down
79 changes: 73 additions & 6 deletions libbpf-tools/klockstat.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "trace_helpers.h"

#define warn(...) fprintf(stderr, __VA_ARGS__)
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

enum {
SORT_ACQ_MAX,
Expand Down Expand Up @@ -93,6 +94,29 @@ static const char program_doc[] =
" klockstat -P # print stats per thread\n"
;

/*
 * Names of all the lock-acquisition functions this tool may attach to.
 * They are resolved to kernel symbol addresses once at startup (see
 * resolve_lock_ksyms()), so that each captured stack trace can be
 * searched for the topmost locking frame instead of relying on a
 * hard-coded skip count.
 */
static const char *lock_ksym_names[] = {
	"mutex_lock",
	"mutex_lock_nested",
	"mutex_lock_interruptible",
	"mutex_lock_interruptible_nested",
	"mutex_lock_killable",
	"mutex_lock_killable_nested",
	"mutex_trylock",
	"down_read",
	"down_read_nested",
	"down_read_interruptible",
	"down_read_killable",
	"down_read_killable_nested",
	"down_read_trylock",
	"down_write",
	"down_write_nested",
	"down_write_killable",
	"down_write_killable_nested",
	"down_write_trylock",
};

/*
 * Resolved addresses for lock_ksym_names[]. Entries are packed from the
 * front; since the array is zero-initialized static storage, the first
 * zero entry terminates the list of valid addresses.
 */
static unsigned long lock_ksym_addr[ARRAY_SIZE(lock_ksym_names)];

static const struct argp_option opts[] = {
{ "pid", 'p', "PID", 0, "Filter by process ID", 0 },
{ "tid", 't', "TID", 0, "Filter by thread ID", 0 },
Expand Down Expand Up @@ -359,6 +383,38 @@ static char *symname(struct ksyms *ksyms, uint64_t pc, char *buf, size_t n)
return buf;
}

/*
 * Resolve every name in lock_ksym_names[] to a kernel symbol address,
 * packing the results into lock_ksym_addr[]. Names that do not exist in
 * the running kernel (e.g. config-dependent lock variants) are silently
 * skipped.
 *
 * Returns 0 on success (at least one symbol resolved), non-zero if none
 * of the lock functions could be resolved.
 */
static int resolve_lock_ksyms(struct ksyms *ksyms)
{
	int n_resolved = 0;
	int idx;

	for (idx = 0; idx < ARRAY_SIZE(lock_ksym_names); idx++) {
		const struct ksym *sym;

		sym = ksyms__get_symbol(ksyms, lock_ksym_names[idx]);
		if (sym)
			lock_ksym_addr[n_resolved++] = sym->addr;
	}

	return n_resolved == 0;
}

/*
 * Walk the captured stack trace in ss->bt[] and look for the topmost
 * frame that belongs to one of the pre-resolved lock functions in
 * lock_ksym_addr[]. The frame just below it (index + 1) is where the
 * real caller frames begin.
 *
 * Returns the offset of the first caller frame, or 0 (no skipping) if
 * no known lock function appears in the trace, e.g. because the stack
 * was truncated or an address could not be symbolized.
 */
static int find_stack_offset(struct ksyms *ksyms, struct stack_stat *ss)
{
	const struct ksym *ksym;
	int i, j;

	for (i = 0; i < PERF_MAX_STACK_DEPTH && ss->bt[i]; i++) {
		ksym = ksyms__map_addr(ksyms, ss->bt[i]);
		/* An address outside the kernel symbol table yields NULL;
		 * skip it rather than dereferencing a NULL pointer. */
		if (!ksym)
			continue;

		for (j = 0; j < ARRAY_SIZE(lock_ksym_addr) && lock_ksym_addr[j]; j++)
			if (ksym->addr == lock_ksym_addr[j])
				return i + 1;
	}

	return 0;
}

static char *print_caller(char *buf, int size, struct stack_stat *ss)
{
snprintf(buf, size, "%u %16s", ss->stack_id, ss->ls.acq_max_comm);
Expand Down Expand Up @@ -404,19 +460,20 @@ static void print_acq_header(void)
static void print_acq_stat(struct ksyms *ksyms, struct stack_stat *ss,
int nr_stack_entries)
{
int offset = find_stack_offset(ksyms, ss);
char buf[40];
char avg[40];
char max[40];
char tot[40];
int i;

printf("%37s %9s %8llu %10s %12s\n",
symname(ksyms, ss->bt[0], buf, sizeof(buf)),
symname(ksyms, ss->bt[offset], buf, sizeof(buf)),
print_time(avg, sizeof(avg), ss->ls.acq_total_time / ss->ls.acq_count),
ss->ls.acq_count,
print_time(max, sizeof(max), ss->ls.acq_max_time),
print_time(tot, sizeof(tot), ss->ls.acq_total_time));
for (i = 1; i < nr_stack_entries; i++) {
for (i = offset + 1; i < nr_stack_entries + offset; i++) {
if (!ss->bt[i] || env.per_thread)
break;
printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf)));
Expand Down Expand Up @@ -457,19 +514,20 @@ static void print_hld_header(void)
static void print_hld_stat(struct ksyms *ksyms, struct stack_stat *ss,
int nr_stack_entries)
{
int offset = find_stack_offset(ksyms, ss);
char buf[40];
char avg[40];
char max[40];
char tot[40];
int i;

printf("%37s %9s %8llu %10s %12s\n",
symname(ksyms, ss->bt[0], buf, sizeof(buf)),
symname(ksyms, ss->bt[offset], buf, sizeof(buf)),
print_time(avg, sizeof(avg), ss->ls.hld_total_time / ss->ls.hld_count),
ss->ls.hld_count,
print_time(max, sizeof(max), ss->ls.hld_max_time),
print_time(tot, sizeof(tot), ss->ls.hld_total_time));
for (i = 1; i < nr_stack_entries; i++) {
for (i = offset + 1; i < nr_stack_entries + offset; i++) {
if (!ss->bt[i] || env.per_thread)
break;
printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf)));
Expand Down Expand Up @@ -504,7 +562,7 @@ static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map)
size_t stats_sz = 1;
uint32_t lookup_key = 0;
uint32_t stack_id;
int ret, i;
int ret, i, offset;
int nr_stack_entries;

stats = calloc(stats_sz, sizeof(void *));
Expand Down Expand Up @@ -538,7 +596,9 @@ static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map)
/* Can still report the results without a backtrace. */
warn("failed to lookup stack_id %u\n", stack_id);
}
if (!env.per_thread && !caller_is_traced(ksyms, ss->bt[0])) {

offset = find_stack_offset(ksyms, ss);
if (!env.per_thread && !caller_is_traced(ksyms, ss->bt[offset])) {
free(ss);
continue;
}
Expand Down Expand Up @@ -721,6 +781,13 @@ int main(int argc, char **argv)
err = 1;
goto cleanup;
}

err = resolve_lock_ksyms(ksyms);
if (err) {
warn("failed to resolve lock ksyms\n");
goto cleanup;
}

if (env.lock_name) {
lock_addr = get_lock_addr(ksyms, env.lock_name);
if (!lock_addr) {
Expand Down

0 comments on commit 22cca3c

Please sign in to comment.