From: Pengfei Li <[email protected]> Add TRACE_STACK_ID event type and integrate ftrace_stackmap into __ftrace_trace_stack(). When the 'stackmap' trace option is enabled, the stack recording path stores a 4-byte stack_id in the ring buffer instead of the full stack trace.
Changes: - New TRACE_STACK_ID in trace_type enum - New stack_id_entry in trace_entries.h - New TRACE_ITER(STACKMAP) trace option flag; when CONFIG_FTRACE_STACKMAP is disabled, TRACE_ITER_STACKMAP_BIT is defined as -1 so that TRACE_ITER(STACKMAP) evaluates to 0 (following the existing pattern used by TRACE_ITER_PROF_TEXT_OFFSET) - 'stackmap' is added to TOP_LEVEL_TRACE_FLAGS and ZEROED_TRACE_FLAGS so it is only exposed under the top-level trace instance, matching the convention already used for global-only options such as 'printk' and 'record-cmd'. Secondary instances under tracing/instances/*/ do not see the option at all, avoiding a confusing no-op. - Modified __ftrace_trace_stack() to call ftrace_stackmap_get_id() when the stackmap option is active. If reserving a TRACE_STACK_ID ring-buffer slot fails after a successful get_id(), the path falls back to full-stack recording so the event still gets a stack trace recorded. - The stackmap pointer is read with smp_load_acquire() and published with smp_store_release(), so a reader that observes the pointer also observes the fully initialized stackmap structure - NULL check on tr->stackmap is retained as defense-in-depth: events that fire before fs_initcall (when the map is created) or after a failed ftrace_stackmap_create() observe a NULL pointer and fall back to full stack recording without dereferencing it - ftrace_stackmap_create() takes the owning trace_array so the stackmap can later check tracing state during reset - Added stack_id print handler in trace_output.c - Added TRACE_STACK_ID to trace_valid_entry() in trace_selftest.c so ftrace startup selftests don't reject the new entry type when the stackmap option is enabled Fallback behavior: if the stackmap pointer is NULL, ftrace_stackmap_get_id() returns an error (e.g. pool exhausted or map resetting), or the TRACE_STACK_ID ring-buffer reservation fails, the full stack trace is recorded as before -- no new failure modes introduced. 
Per-instance stackmap support is left as a follow-up; gating the option via TOP_LEVEL_TRACE_FLAGS makes the global-only scope explicit at the tracefs interface rather than relying on a silent runtime fallback. Usage: echo 1 > /sys/kernel/debug/tracing/options/stackmap echo 1 > /sys/kernel/debug/tracing/options/stacktrace Signed-off-by: Pengfei Li <[email protected]> --- kernel/trace/trace.c | 78 ++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 16 +++++++ kernel/trace/trace_entries.h | 15 +++++++ kernel/trace/trace_output.c | 23 +++++++++++ kernel/trace/trace_selftest.c | 1 + 5 files changed, 131 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6eb4d3097a4d..36120355e549 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -57,6 +57,7 @@ #include "trace.h" #include "trace_output.h" +#include "trace_stackmap.h" #ifdef CONFIG_FTRACE_STARTUP_TEST /* @@ -509,12 +510,13 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \ TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \ - TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS) + TRACE_ITER(PROF_TEXT_OFFSET) | TRACE_ITER(STACKMAP) | \ + FPROFILE_DEFAULT_FLAGS) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \ - TRACE_ITER(COPY_MARKER)) + TRACE_ITER(COPY_MARKER) | TRACE_ITER(STACKMAP)) /* * The global_trace is the descriptor that holds the top-level tracing @@ -2184,6 +2186,49 @@ void __ftrace_trace_stack(struct trace_array *tr, } #endif +#ifdef CONFIG_FTRACE_STACKMAP + /* + * If stackmap dedup is enabled, try to store only the stack_id + * in the ring buffer instead of the full stack trace. 
+ */ + if (tr->trace_flags & TRACE_ITER(STACKMAP)) { + struct ftrace_stackmap *smap; + struct stack_id_entry *sid_entry; + int sid; + + smap = smp_load_acquire(&tr->stackmap); + if (!smap) + goto full_stack; + + sid = ftrace_stackmap_get_id(smap, fstack->calls, nr_entries); + if (sid >= 0) { + event = __trace_buffer_lock_reserve(buffer, + TRACE_STACK_ID, + sizeof(*sid_entry), trace_ctx); + if (!event) { + /* + * Could not reserve a TRACE_STACK_ID slot; + * fall back to the full-stack path so the + * event still gets a stack trace recorded. + */ + goto full_stack; + } + sid_entry = ring_buffer_event_data(event); + sid_entry->stack_id = sid; + /* + * stack_id is a synthetic side-event attached to a + * primary trace event that was already subject to + * filtering. No per-event filter is defined for + * TRACE_STACK_ID, so commit unconditionally. + */ + __buffer_unlock_commit(buffer, event); + goto out; + } + /* On stackmap failure, record the full stack instead. */ + } +full_stack: +#endif + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); @@ -9222,6 +9267,35 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work) NULL, &tracing_dyn_info_fops); #endif +#ifdef CONFIG_FTRACE_STACKMAP + { + struct ftrace_stackmap *smap; + + smap = ftrace_stackmap_create(&global_trace); + if (!IS_ERR(smap)) { + /* + * Use smp_store_release to ensure the stackmap + * structure is fully initialized before publishing + * the pointer to concurrent trace event readers. 
+ */ + smp_store_release(&global_trace.stackmap, smap); + trace_create_file("stack_map", TRACE_MODE_WRITE, NULL, + smap, &ftrace_stackmap_fops); + trace_create_file("stack_map_stat", TRACE_MODE_READ, NULL, + smap, &ftrace_stackmap_stat_fops); + trace_create_file("stack_map_bin", TRACE_MODE_READ, NULL, + smap, &ftrace_stackmap_bin_fops); + } else { + pr_warn("ftrace stackmap init failed, dedup disabled\n"); + /* + * global_trace is statically defined; its stackmap + * field is zero-initialized via BSS, so leaving it + * NULL ensures the smp_load_acquire() in + * __ftrace_trace_stack() falls back to full stack. + */ + } + } +#endif create_trace_instances(NULL); update_tracer_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 80fe152af1dd..7e7d5e5a35ff 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -57,6 +57,7 @@ enum trace_type { TRACE_TIMERLAT, TRACE_RAW_DATA, TRACE_FUNC_REPEATS, + TRACE_STACK_ID, __TRACE_LAST_TYPE, }; @@ -453,6 +454,9 @@ struct trace_array { struct cond_snapshot *cond_snapshot; #endif struct trace_func_repeats __percpu *last_func_repeats; +#ifdef CONFIG_FTRACE_STACKMAP + struct ftrace_stackmap *stackmap; +#endif /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. 
@@ -579,6 +583,8 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct func_repeats_entry, \ TRACE_FUNC_REPEATS); \ + IF_ASSIGN(var, ent, struct stack_id_entry, \ + TRACE_STACK_ID); \ __ftrace_bad_type(); \ } while (0) @@ -1449,7 +1455,16 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define STACK_FLAGS #endif +#ifdef CONFIG_FTRACE_STACKMAP +# define STACKMAP_FLAGS \ + C(STACKMAP, "stackmap"), +#else +# define STACKMAP_FLAGS +# define TRACE_ITER_STACKMAP_BIT -1 +#endif + #ifdef CONFIG_FUNCTION_PROFILER + # define PROFILER_FLAGS \ C(PROF_TEXT_OFFSET, "prof-text-offset"), # ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -1506,6 +1521,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ + STACKMAP_FLAGS \ BRANCH_FLAGS \ PROFILER_FLAGS \ FPROFILE_FLAGS diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 54417468fdeb..89ed14b7e5fd 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -250,6 +250,21 @@ FTRACE_ENTRY(user_stack, userstack_entry, (void *)__entry->caller[6], (void *)__entry->caller[7]) ); +/* + * Stack ID entry - stores only a stack_id referencing the stackmap. + * Used when CONFIG_FTRACE_STACKMAP is enabled to deduplicate stacks. 
+ */ +FTRACE_ENTRY(stack_id, stack_id_entry, + + TRACE_STACK_ID, + + F_STRUCT( + __field( int, stack_id ) + ), + + F_printk("<stack_id %d>", __entry->stack_id) +); + /* * trace_printk entry: */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a5ad76175d10..68678ea88159 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1517,6 +1517,28 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_STACK_ID */ +static enum print_line_t trace_stack_id_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct stack_id_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + trace_seq_printf(s, "<stack_id %d>\n", field->stack_id); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_stack_id_funcs = { + .trace = trace_stack_id_print, +}; + +static struct trace_event trace_stack_id_event = { + .type = TRACE_STACK_ID, + .funcs = &trace_stack_id_funcs, +}; + /* TRACE_HWLAT */ static enum print_line_t trace_hwlat_print(struct trace_iterator *iter, int flags, @@ -1908,6 +1930,7 @@ static struct trace_event *events[] __initdata = { &trace_wake_event, &trace_stack_event, &trace_user_stack_event, + &trace_stack_id_event, &trace_bputs_event, &trace_bprint_event, &trace_print_event, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 929c84075315..0c97065b0d68 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -14,6 +14,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_CTX: case TRACE_WAKE: case TRACE_STACK: + case TRACE_STACK_ID: case TRACE_PRINT: case TRACE_BRANCH: case TRACE_GRAPH_ENT: -- 2.34.1
