On Tue, Mar 31, 2026 at 11:13 AM Steven Rostedt <[email protected]> wrote: > > From: Steven Rostedt <[email protected]> > > Add specific reporting of the futex system call. This allows for debugging > the futex code a bit easier. Instead of just showing the values passed > into the futex system call, read the value of the user space memory > pointed to by the addr parameter. > > Also make the op parameter more readable by parsing the values to show > what the command is: > > futex_requeue_p-3251 [002] ..... 2101.068479: sys_futex(uaddr: > 0x55e79a4da834 (0x80000cb1), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0) > futex_requeue_p-3248 [001] ..... 2101.068970: sys_futex(uaddr: > 0x7f859072f990 (0xcb2), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, val: 3250)
nit: Would this read better if `val` were also hex? futex_requeue_p-3248 [001] ..... 2101.068970: sys_futex(uaddr: 0x7f859072f990 (0xcb2), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, val: 0xcb2) The val matching the value in uaddr is the condition necessary for the futex to sleep. > futex_requeue_p-3252 [005] ..... 2101.069108: sys_futex(uaddr: > 0x55e79a4da838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, val: 0, > timespec: 0x7ffe61076aa0, uaddr2: 0x55e79a4da834, uaddr2: 94453214586932, > val3: 0) > futex_requeue_p-3252 [005] ..... 2101.069410: sys_futex(uaddr: > 0x55e79a4da834 (0x80000cb1), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0) > > Signed-off-by: Steven Rostedt (Google) <[email protected]> > --- > Changes since v2: https://patch.msgid.link/[email protected] > > - Removed unused "buf" variable (kernel test robot) > > - Iterate __futex_cmds[] make the print statement. > Note this required exposing __futex_cmds[] to trace_syscall.c > > - Added back val statement (with the move to futex/syscall.c the > third parameter was dropped). > > include/linux/futex.h | 8 +- > kernel/futex/syscalls.c | 97 +++++++++++++++++++++++ > kernel/trace/trace_syscalls.c | 144 +++++++++++++++++++++++++++++++++- > 3 files changed, 245 insertions(+), 4 deletions(-) > > diff --git a/include/linux/futex.h b/include/linux/futex.h > index 9e9750f04980..7eaf01ff68cf 100644 > --- a/include/linux/futex.h > +++ b/include/linux/futex.h > @@ -82,6 +82,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t > *timeout, > u32 __user *uaddr2, u32 val2, u32 val3); > int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long > arg4); > > +#ifdef CONFIG_FTRACE_SYSCALLS > +void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long > *args, u32 *ptr); > +extern const char * __futex_cmds[]; > +#endif > + > #ifdef CONFIG_FUTEX_PRIVATE_HASH > int futex_hash_allocate_default(void); > void futex_hash_free(struct mm_struct *mm); > @@ -114,7 +119,8 @@ static inline int futex_hash_allocate_default(void) > } > static inline int futex_hash_free(struct mm_struct *mm) { return 0; } > static inline int futex_mm_init(struct mm_struct *mm) { return 0; } > - > +static inline void futex_print_syscall(struct seq_buf *s, int nr_args, > + unsigned long *args, u32 *ptr) { } > #endif > > #endif > diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c > index 743c7a728237..3ba8ca017c9c 100644 > --- a/kernel/futex/syscalls.c > +++ b/kernel/futex/syscalls.c > @@ -171,6 +171,18 @@ static __always_inline bool futex_cmd_has_timeout(u32 > cmd) > return false; > } > > +static __always_inline bool futex_cmd_has_addr2(u32 cmd) > +{ > + switch (cmd) { > + case FUTEX_REQUEUE: > + case FUTEX_CMP_REQUEUE: > + case FUTEX_WAKE_OP: > + case FUTEX_WAIT_REQUEUE_PI: > + return true; > + } > + return false; > +} > + > static __always_inline int > futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) > { > @@ -207,6 +219,91 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, > u32, val, > return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, > val3); > } > > +#ifdef CONFIG_FTRACE_SYSCALLS > + > +/* End with NULL for iterators */ > +const char * __futex_cmds[] = > +{ > + "FUTEX_WAIT", "FUTEX_WAKE", "FUTEX_FD", "FUTEX_REQUEUE", > + "FUTEX_CMP_REQUEUE", "FUTEX_WAKE_OP", "FUTEX_LOCK_PI", > + "FUTEX_UNLOCK_PI", "FUTEX_TRYLOCK_PI", "FUTEX_WAIT_BITSET", > + "FUTEX_WAKE_BITSET", "FUTEX_WAIT_REQUEUE_PI", "FUTEX_CMP_REQUEUE_PI", > + "FUTEX_LOCK_PI2", NULL > +}; > + > +void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long > *args, u32 *ptr) > +{ > + unsigned int op, cmd; > + bool done = false; > + > + for (int i = 0; !done && i < nr_args; i++) { > + > + if (seq_buf_has_overflowed(s)) > + break; > + > + switch (i) { > + case 0: > + seq_buf_printf(s, "uaddr: 0x%lx", args[i]); > + if (ptr) { > + u32 val = *ptr; > + if (val < 10) > + seq_buf_printf(s, " (%u)", val); > + else > + seq_buf_printf(s, " (0x%x)", val); > + } > + continue; > + case 1: > + op = args[i]; > + cmd = op & FUTEX_CMD_MASK; > + if (cmd <= FUTEX_LOCK_PI2) > + seq_buf_printf(s, ", %s", __futex_cmds[cmd]); > + else > + seq_buf_puts(s, ", UNKNOWN"); > + > + if (op & FUTEX_PRIVATE_FLAG) > + seq_buf_puts(s, "|FUTEX_PRIVATE_FLAG"); > + if (op & FUTEX_CLOCK_REALTIME) > + seq_buf_puts(s, "|FUTEX_CLOCK_REALTIME"); > + continue; > + case 2: > + if (args[i] < 10) > + seq_buf_printf(s, ", val: %ld", args[i]); > + else > + seq_buf_printf(s, ", val: 0x%lx", args[i]); I'm confused why this hex formatting didn't appear in the sample output. Thanks, Ian > + continue; > + case 3: > + if (!futex_cmd_has_timeout(cmd)) { > + > + if (!futex_cmd_has_addr2(cmd)) { > + done = true; > + continue; > + } > + > + seq_buf_printf(s, ", val2: 0x%x", > (u32)(long)args[i]); > + continue; > + } > + > + if (!args[i]) > + continue; > + > + seq_buf_printf(s, ", timespec: 0x%lx", args[i]); > + continue; > + case 4: > + if (!futex_cmd_has_addr2(cmd)) { > + done = true; > + continue; > + } > + seq_buf_printf(s, ", uaddr2: 0x%lx", args[i]); > + continue; > + case 5: > + seq_buf_printf(s, ", val3: %lu", args[i]); > + done = true; > + continue; > + } > + } > +} > +#endif > + > /** > * futex_parse_waitv - Parse a waitv array from userspace > * @futexv: Kernel side list of waiters to be filled > diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c > index 37317b81fcda..f8213d772f89 100644 > --- a/kernel/trace/trace_syscalls.c > +++ b/kernel/trace/trace_syscalls.c > @@ -6,11 +6,13 @@ > #include <linux/slab.h> > #include <linux/kernel.h> > #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ > +#include <linux/futex.h> > #include <linux/ftrace.h> > #include <linux/perf_event.h> > #include <linux/xarray.h> > #include <asm/syscall.h> > > + > #include "trace_output.h" > #include "trace.h" > > @@ -237,6 +239,27 @@ sys_enter_openat_print(struct syscall_trace_enter > *trace, struct syscall_metadat > return trace_handle_return(s); > } > > +static enum print_line_t > +sys_enter_futex_print(struct syscall_trace_enter *trace, struct > syscall_metadata *entry, > + struct trace_seq *s, struct trace_event *event, int > ent_size) > +{ > + void *end = (void *)trace + ent_size; > + void *ptr; > + > + /* Set ptr to the user space copied area */ > + ptr = (void *)trace->args + sizeof(unsigned long) * entry->nb_args; > + if (ptr + 4 > end) > + ptr = NULL; > + > + trace_seq_printf(s, "%s(", entry->name); > + > + futex_print_syscall(&s->seq, entry->nb_args, trace->args, ptr); > + > + trace_seq_puts(s, ")\n"); > + > + return trace_handle_return(s); > +} > + > static enum print_line_t > print_syscall_enter(struct trace_iterator *iter, int flags, > struct trace_event *event) > @@ -267,6 +290,10 @@ print_syscall_enter(struct trace_iterator *iter, int > flags, > if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE))) > return sys_enter_openat_print(trace, entry, s, event); > break; > + case __NR_futex: > + if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE))) > + return sys_enter_futex_print(trace, entry, s, event, > iter->ent_size); > + break; > default: > break; > } > @@ -437,6 +464,48 @@ sys_enter_openat_print_fmt(struct syscall_metadata > *entry, char *buf, int len) > return pos; > } > > +static int __init > +sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len) > +{ > + int pos = 0; > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + "\"uaddr: 0x%%lx (0x%%lx) cmd=%%s%%s%%s"); > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " val: 0x%%x timeout/val2: 0x%%llx"); > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " uaddr2: 0x%%lx val3: 0x%%x\", "); > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " REC->uaddr,"); > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " REC->__value,"); > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " __print_symbolic(REC->op & 0x%x, ", > FUTEX_CMD_MASK); > + > + for (int i = 0; __futex_cmds[i]; i++) { > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + "%s{%d, \"%s\"} ", > + i ? "," : "", i, __futex_cmds[i]); > + } > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, "), "); > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " (REC->op & %d) ? \"|FUTEX_PRIVATE_FLAG\" : \"\",", > + FUTEX_PRIVATE_FLAG); > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " (REC->op & %d) ? \"|FUTEX_CLOCK_REALTIME\" : \"\",", > + FUTEX_CLOCK_REALTIME); > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " REC->val, REC->utime,"); > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " REC->uaddr, REC->val3"); > + return pos; > +} > + > static int __init > __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) > { > @@ -447,6 +516,8 @@ __set_enter_print_fmt(struct syscall_metadata *entry, > char *buf, int len) > switch (entry->syscall_nr) { > case __NR_openat: > return sys_enter_openat_print_fmt(entry, buf, len); > + case __NR_futex: > + return sys_enter_futex_print_fmt(entry, buf, len); > default: > break; > } > @@ -523,6 +594,21 @@ static void __init free_syscall_print_fmt(struct > trace_event_call *call) > kfree(call->print_fmt); > } > > +static int __init futex_fields(struct trace_event_call *call, int offset) > +{ > + char *arg; > + int ret; > + > + arg = kstrdup("__value", GFP_KERNEL); > + if (WARN_ON_ONCE(!arg)) > + return -ENOMEM; > + ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0, > + FILTER_OTHER); > + if (ret) > + kfree(arg); > + return ret; > +} > + > static int __init syscall_enter_define_fields(struct trace_event_call *call) > { > struct syscall_trace_enter trace; > @@ -544,6 +630,9 @@ static int __init syscall_enter_define_fields(struct > trace_event_call *call) > offset += sizeof(unsigned long); > } > > + if (!ret && meta->syscall_nr == __NR_futex) > + return futex_fields(call, offset); > + > if (ret || !meta->user_mask) > return ret; > > @@ -689,6 +778,45 @@ static int syscall_copy_user_array(char *buf, const char > __user *ptr, > return 0; > } > > +static int > +syscall_get_futex(unsigned long *args, char **buffer, int *size, int > buf_size) > +{ > + struct syscall_user_buffer *sbuf; > + const char __user *ptr; > + > + /* buf_size of zero means user doesn't want user space read */ > + if (!buf_size) > + return -1; > + > + /* If the syscall_buffer is NULL, tracing is being shutdown */ > + sbuf = READ_ONCE(syscall_buffer); > + if (!sbuf) > + return -1; > + > + ptr = (char __user *)args[0]; > + > + *buffer = trace_user_fault_read(&sbuf->buf, ptr, 4, NULL, NULL); > + if (!*buffer) > + return -1; > + > + /* Add room for the value */ > + *size += 4; > + > + return 0; > +} > + > +static void syscall_put_futex(struct syscall_metadata *sys_data, > + struct syscall_trace_enter *entry, > + char *buffer) > +{ > + u32 *ptr; > + > + /* Place the futex key into the storage */ > + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; > + > + *ptr = *(u32 *)buffer; > +} > + > static char *sys_fault_user(unsigned int buf_size, > struct syscall_metadata *sys_data, > struct syscall_user_buffer *sbuf, > @@ -905,6 +1033,9 @@ static void ftrace_syscall_enter(void *data, struct > pt_regs *regs, long id) > if (syscall_get_data(sys_data, args, &user_ptr, > &size, user_sizes, &uargs, > tr->syscall_buf_sz) < 0) > return; > + } else if (syscall_nr == __NR_futex) { > + if (syscall_get_futex(args, &user_ptr, &size, > tr->syscall_buf_sz) < 0) > + return; > } > > size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; > @@ -921,6 +1052,9 @@ static void ftrace_syscall_enter(void *data, struct > pt_regs *regs, long id) > if (mayfault) > syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, > uargs); > > + else if (syscall_nr == __NR_futex) > + syscall_put_futex(sys_data, entry, user_ptr); > + > trace_event_buffer_commit(&fbuffer); > } > > @@ -971,14 +1105,18 @@ static int reg_event_syscall_enter(struct > trace_event_file *file, > { > struct syscall_metadata *sys_data = call->data; > struct trace_array *tr = file->tr; > + bool enable_faults; > int ret = 0; > int num; > > num = sys_data->syscall_nr; > if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) > return -ENOSYS; > + > + enable_faults = sys_data->user_mask || num == __NR_futex; > + > guard(mutex)(&syscall_trace_lock); > - if (sys_data->user_mask) { > + if (enable_faults) { > ret = syscall_fault_buffer_enable(); > if (ret < 0) > return ret; > @@ -986,7 +1124,7 @@ static int reg_event_syscall_enter(struct > trace_event_file *file, > if (!tr->sys_refcount_enter) { > ret = register_trace_sys_enter(ftrace_syscall_enter, tr); > if (ret < 0) { > - if (sys_data->user_mask) > + if (enable_faults) > syscall_fault_buffer_disable(); > return ret; > } > @@ -1011,7 +1149,7 @@ static void unreg_event_syscall_enter(struct > trace_event_file *file, > WRITE_ONCE(tr->enter_syscall_files[num], NULL); > if (!tr->sys_refcount_enter) > unregister_trace_sys_enter(ftrace_syscall_enter, tr); > - if (sys_data->user_mask) > + if (sys_data->user_mask || num == __NR_futex) > syscall_fault_buffer_disable(); > } > > -- > 2.51.0 > >
