Hi Steve,

I updated the patch with all the fixes, except for the update-timestamp
part.

I used ftrace_export as you suggested, but I don't know how to hook into
the ring buffer to update the timestamp, so for now I update time_delta
the same way as before, in ftrace_vearly_export_write():

static u64 export_delta __initdata;

static void __init ftrace_vearly_export_write(const void *buf, unsigned int len)
{
	struct ring_buffer_event *event;
	const u32 *entry = buf;

	event = container_of(entry, struct ring_buffer_event, array[0]);
	event->time_delta = export_delta;
}

Do I have to add a hook somewhere in kernel/trace/ring_buffer.c, and then
call it from ftrace_vearly_export_write()?
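Something along these lines is what I have in mind. This is only a sketch:
ring_buffer_event_set_time_delta() does not exist today, the name and
placement are made up, and I'm not sure it is the interface you would want:

/* hypothetical helper in kernel/trace/ring_buffer.c */
int ring_buffer_event_set_time_delta(struct ring_buffer_event *event,
				     u64 delta)
{
	/*
	 * time_delta is a 27-bit bitfield, so a delta larger than that
	 * cannot be represented here; it would have to be injected as a
	 * RINGBUF_TYPE_TIME_EXTEND event instead.
	 */
	if (delta > (1ULL << 27) - 1)
		return -EINVAL;

	event->time_delta = delta;
	return 0;
}

Then ftrace_vearly_export_write() would call this helper instead of
reaching into the event with container_of(), and overflow of the 27-bit
field would at least be detected.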
PATCH v2
--------------------------------------------------------------------

Very early tracing starts at the beginning of start_kernel() and stops
at ftrace_init():

start_kernel()
{
	ftrace_early_init()  <--- start very early tracing
	...
	(calls)
	...
	ftrace_init()        <--- stop very early tracing
	early_trace_init();  <--- fill ring buffer, start function tracer if on
	...
}

Currently, the events are saved in a temporary static buffer; they are
copied into the ring buffer once it has been allocated. For the
timestamp, we use rdtsc on x86 and sched_clock on the other
architectures.

Dynamic tracing is currently not implemented with live patching.
Instead, we use ftrace_filter and ftrace_notrace to determine which
functions should be filtered (traced / not traced); then, in the
callback from the mcount hook, we do a binary-search lookup to decide
whether to record each function.
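For example, booting with:

	ftrace_vearly ftrace_notrace=*rcu* ftrace_filter=mem*,sched*

should record only the very early functions matching mem* or sched*
while skipping the RCU helpers; the glob patterns go through
filter_parse_regex(), as with the regular ftrace filter files.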
Signed-off-by: Abderrahmane Benbachir <abderrahmane.benbac...@polymtl.ca>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoy...@efficios.com>
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/Kconfig            |   1 +
 arch/x86/kernel/ftrace_32.S |  73 ++++++++++
 arch/x86/kernel/ftrace_64.S |  14 ++
 include/linux/ftrace.h      |  16 ++-
 init/main.c                 |   1 +
 kernel/trace/Kconfig        |  42 ++++++
 kernel/trace/ftrace.c       | 341 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.c        |   3 +
 8 files changed, 487 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 971feac13506..03da741c088b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -137,6 +137,7 @@ config X86
 	select HAVE_FENTRY			if X86_64 || DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_VERY_EARLY_FTRACE
 	select HAVE_FUNCTION_TRACER
 	select HAVE_GCC_PLUGINS
 	select HAVE_HW_BREAKPOINT
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 722a145b4139..5d93037872b2 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -29,8 +29,81 @@ EXPORT_SYMBOL(mcount)
 # define MCOUNT_FRAME			0	/* using frame = false */
 #endif

+.macro save_mcount_regs
+#ifdef USING_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+	/*
+	 * Frame pointers are of ip followed by bp.
+	 * Since fentry is an immediate jump, we are left with
+	 * parent-ip, function-ip. We need to add a frame with
+	 * parent-ip followed by ebp.
+	 */
+	pushl	4(%esp)				/* parent ip */
+	pushl	%ebp
+	movl	%esp, %ebp
+	pushl	2*4(%esp)			/* function ip */
+# endif
+	/* For mcount, the function ip is directly above */
+	pushl	%ebp
+	movl	%esp, %ebp
+#endif
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	pushl	$0				/* Pass NULL as regs pointer */
+
+#ifdef USING_FRAME_POINTER
+	/* Load parent ebp into edx */
+	movl	4*4(%esp), %edx
+#else
+	/* There's no frame pointer, load the appropriate stack addr instead */
+	lea	4*4(%esp), %edx
+#endif
+
+	movl	(MCOUNT_FRAME+4)*4(%esp), %eax	/* load the rip */
+	/* Get the parent ip */
+	movl	4(%edx), %edx			/* edx has ebp */
+
+	movl	function_trace_op, %ecx
+	subl	$MCOUNT_INSN_SIZE, %eax
+.endm
+
+.macro restore_mcount_regs
+	addl	$4, %esp			/* skip NULL pointer */
+	popl	%edx
+	popl	%ecx
+	popl	%eax
+#ifdef USING_FRAME_POINTER
+	popl	%ebp
+# ifdef CC_USING_FENTRY
+	addl	$4, %esp			/* skip function ip */
+	popl	%ebp				/* this is the orig bp */
+	addl	$4, %esp			/* skip parent ip */
+# endif
+#endif
+.endm
+
 ENTRY(function_hook)
+#ifdef CONFIG_VERY_EARLY_FUNCTION_TRACER
+	cmpl	$__PAGE_OFFSET, %esp
+	jb	vearly_stub			/* Paging not enabled yet? */
+
+	cmpl	$ftrace_stub, ftrace_vearly_trace_function
+	jnz	vearly_trace
+
+vearly_stub:
+	ret
+
+vearly_trace:
+	save_mcount_regs
+	call	*ftrace_vearly_trace_function
+	restore_mcount_regs
+
+	jmp	vearly_stub
+#else
 	ret
+#endif
 END(function_hook)

 ENTRY(ftrace_caller)
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 1dfac634bbf7..35f73a9dbbd2 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -146,7 +146,21 @@ EXPORT_SYMBOL(mcount)
 #ifdef CONFIG_DYNAMIC_FTRACE

 ENTRY(function_hook)
+#ifdef CONFIG_VERY_EARLY_FUNCTION_TRACER
+	cmpq	$ftrace_stub, ftrace_vearly_trace_function
+	jnz	vearly_trace
+
+vearly_stub:
+	retq
+
+vearly_trace:
+	save_mcount_regs
+	call	*ftrace_vearly_trace_function
+	restore_mcount_regs
+	jmp	vearly_stub
+#else
 	retq
+#endif
 END(function_hook)

 ENTRY(ftrace_caller)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2e028854bac7..c3f2ba246fc9 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -272,6 +272,16 @@ static inline void ftrace_kill(void) { }
 static inline void ftrace_free_init_mem(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */

+#ifdef CONFIG_VERY_EARLY_FUNCTION_TRACER
+extern void ftrace_early_init(char *command_line);
+extern void ftrace_early_shutdown(void);
+extern void ftrace_early_fill_ringbuffer(void *data);
+#else
+static inline void ftrace_early_init(char *command_line) { }
+static inline void ftrace_early_shutdown(void) { }
+static inline void ftrace_early_fill_ringbuffer(void *data) { }
+#endif
+
 #ifdef CONFIG_STACK_TRACER

 #define STACK_TRACE_ENTRIES 500
@@ -476,6 +486,10 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec);

 extern ftrace_func_t ftrace_trace_function;

+#if defined(CONFIG_VERY_EARLY_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+extern ftrace_func_t ftrace_vearly_trace_function;
+#endif
+
 int ftrace_regex_open(struct ftrace_ops *ops, int flag,
 		  struct inode *inode, struct file *file);
 ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
@@ -757,7 +771,7 @@ static inline unsigned long get_lock_parent_ip(void)
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
 #else
-static inline void ftrace_init(void) { }
+static inline void ftrace_init(void) { ftrace_early_shutdown(); }
 #endif

 /*
diff --git a/init/main.c b/init/main.c
index 0ee9c6866ada..c794292b18ef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -511,6 +511,7 @@ asmlinkage __visible void __init start_kernel(void)
 	char *command_line;
 	char *after_dashes;

+	ftrace_early_init(boot_command_line);
 	set_task_stack_end_magic(&init_task);
 	smp_setup_processor_id();
 	debug_objects_early_init();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..0e1a9662bb2f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -19,6 +19,11 @@ config HAVE_FUNCTION_TRACER
 	help
 	  See Documentation/trace/ftrace-design.txt

+config HAVE_VERY_EARLY_FTRACE
+	bool
+	help
+	  See Documentation/trace/ftrace-design.txt
+
 config HAVE_FUNCTION_GRAPH_TRACER
 	bool
 	help
@@ -145,6 +150,43 @@ config FUNCTION_TRACER
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.

+config VERY_EARLY_FUNCTION_TRACER
+	bool "Very Early Kernel Function Tracer"
+	depends on FUNCTION_TRACER
+	depends on HAVE_VERY_EARLY_FTRACE
+	default y
+	help
+	  Enable very early function tracing. When ftrace_init() is called,
+	  all mcount call sites are replaced by nops, but before that (at the
+	  very early stage) the mcount calls are still in place, and they are
+	  used for very early tracing.
+
+config VERY_EARLY_BUF_SHIFT
+	int "Temporary buffer size (17 => 128 KB, 24 => 16 MB)"
+	depends on VERY_EARLY_FUNCTION_TRACER
+	range 0 24
+	default 19
+	help
+	  Select the size of the buffer to be used for storing function calls
+	  at the very early stage.
+	  Examples:
+	    20 =>   1 MB
+	    19 => 512 KB
+	    17 => 128 KB
+
+config VERY_EARLY_FILTER_SHIFT
+	int "Temporary filter size (filter/notrace) (17 => 128 KB, 19 => 512 KB)"
+	depends on VERY_EARLY_FUNCTION_TRACER
+	depends on DYNAMIC_FTRACE
+	range 0 19
+	default 17
+	help
+	  Select the size of the filter buffer to be used for filtering
+	  (trace / no trace) functions at the very early stage.
+	  Examples:
+	    19 => 512 KB
+	    18 => 256 KB
+	    17 => 128 KB
+
 config FUNCTION_GRAPH_TRACER
 	bool "Kernel Function Graph Tracer"
 	depends on HAVE_FUNCTION_GRAPH_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8319e09e15b9..5e1b53662bbb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,7 +33,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/rcupdate.h>
-
+#include <linux/trace.h>
 #include <trace/events/sched.h>

 #include <asm/sections.h>
@@ -65,6 +65,8 @@
 #define FTRACE_HASH_MAX_BITS 12

 #ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long __start_mcount_loc[];
+extern unsigned long __stop_mcount_loc[];
 #define INIT_OPS_HASH(opsname)	\
 	.func_hash		= &opsname.local_hash,			\
 	.local_hash.regex_lock	= __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
@@ -5912,11 +5914,11 @@ void __init ftrace_free_init_mem(void)

 void __init ftrace_init(void)
 {
-	extern unsigned long __start_mcount_loc[];
-	extern unsigned long __stop_mcount_loc[];
 	unsigned long count, flags;
 	int ret;

+	ftrace_early_shutdown();
+
 	local_irq_save(flags);
 	ret = ftrace_dyn_arch_init();
 	local_irq_restore(flags);
@@ -6882,3 +6884,336 @@ void ftrace_graph_exit_task(struct task_struct *t)
 	kfree(ret_stack);
 }
 #endif
+
+
+#ifdef CONFIG_VERY_EARLY_FUNCTION_TRACER
+
+#define VEARLY_BUF_LEN ((1 << CONFIG_VERY_EARLY_BUF_SHIFT) / \
+	sizeof(struct ftrace_vearly_entry))
+/*
+ * In the very early stage:
+ * - only CPU0 is running, so there is no need to store the cpu id
+ * - the tsc counter is used on x86, otherwise we use sched_clock
+ */
+struct ftrace_vearly_entry {
+	unsigned long ip;
+	unsigned long parent_ip;
+	unsigned long long clock;
+};
+static struct ftrace_vearly_entry ftrace_vearly_entries[VEARLY_BUF_LEN] __initdata;
+static const unsigned long VERY_EARLY_BUF_LEN __initconst = VEARLY_BUF_LEN;
+static struct trace_export ftrace_vearly_export __initdata;
+static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
+static unsigned int ftrace_vearly_enabled __initdata;
+static unsigned int vearly_entries_count __initdata;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ftrace_func_t ftrace_vearly_trace_function __read_mostly = ftrace_stub;
+#else
+# define ftrace_vearly_trace_function ftrace_trace_function
+#endif
+#define ftrace_vearly_disable() (ftrace_vearly_trace_function = ftrace_stub)
+
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define VEARLY_FILTER_SIZE ((1 << CONFIG_VERY_EARLY_FILTER_SHIFT) / \
+	sizeof(unsigned long))
+struct ftrace_vearly_filtering {
+	unsigned long list[VEARLY_FILTER_SIZE];
+	char buf[COMMAND_LINE_SIZE];
+	int size;
+};
+static const unsigned long FILTER_SIZE __initconst = VEARLY_FILTER_SIZE;
+static struct ftrace_vearly_filtering ftrace_data_notrace __initdata;
+static struct ftrace_vearly_filtering ftrace_data_filter __initdata;
+
+static __init int
+ftrace_vearly_match_record(unsigned long ip, struct ftrace_glob *func_g)
+{
+	char str[KSYM_SYMBOL_LEN];
+	char *modname;
+
+	kallsyms_lookup(ip, NULL, NULL, &modname, str);
+	return ftrace_match(str, func_g);
+}
+
+static __init int ftrace_vearly_filter_has_addr(unsigned long addr,
+					unsigned long *filter, int *size)
+{
+	int i;
+
+	for (i = 0; i < *size; i++) {
+		if (filter[i] == addr)
+			return 1;
+	}
+	return 0;
+}
+
+static __init void
+ftrace_vearly_regex(char *func, unsigned long *filter, int *size)
+{
+	struct ftrace_glob func_g = { .type = MATCH_FULL };
+	unsigned long *start = __start_mcount_loc;
+	unsigned long *end = __stop_mcount_loc;
+	unsigned long count;
+	unsigned long addr;
+	unsigned long *p;
+	int clear_filter = 0;
+
+	count = end - start;
+
+	if (!count)
+		return;
+
+	if (func) {
+		func_g.type = filter_parse_regex(func, strlen(func),
+						 &func_g.search, &clear_filter);
+		func_g.len = strlen(func_g.search);
+	}
+
+	p = start;
+	while (p < end) {
+		addr = ftrace_call_adjust(*p++);
+		if (!addr)
+			continue;
+
+		/* use >=, (*size) == FILTER_SIZE would overflow the array */
+		if ((*size) >= FILTER_SIZE)
+			return;
+
+		if (ftrace_vearly_match_record(addr, &func_g)) {
+			if (!ftrace_vearly_filter_has_addr(addr, filter, size))
+				filter[(*size)++] = addr;
+		}
+	}
+}
+
+static int __init ftrace_addr_compare(const void *a, const void *b)
+{
+	if (*(unsigned long *)a > *(unsigned long *)b)
+		return 1;
+	if (*(unsigned long *)a < *(unsigned long *)b)
+		return -1;
+
+	return 0;
+}
+
+static void __init ftrace_addr_swap(void *a, void *b, int size)
+{
+	unsigned long t = *(unsigned long *)a;
+
+	*(unsigned long *)a = *(unsigned long *)b;
+	*(unsigned long *)b = t;
+}
+
+static __init int set_ftrace_vearly_filtering(void *data, char *str)
+{
+	struct ftrace_vearly_filtering *ftrace_data = data;
+	char *func;
+	char *buf;
+
+	if (!ftrace_data)
+		return 0;
+	buf = ftrace_data->buf;
+	strlcpy(buf, str, COMMAND_LINE_SIZE);
+
+	while (buf) {
+		func = strsep(&buf, ",");
+		ftrace_vearly_regex(func, ftrace_data->list, &ftrace_data->size);
+	}
+	/* sort the filter to be able to use binary search on it */
+	sort(ftrace_data->list, ftrace_data->size,
+	     sizeof(unsigned long), ftrace_addr_compare, ftrace_addr_swap);
+
+	return 1;
+}
+
+#define ftrace_vearly_bsearch_addr(addr, data) bsearch(&addr, data.list, \
+	data.size, sizeof(unsigned long), ftrace_addr_compare)
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+
+static __init void
+ftrace_function_vearly_trace_call(unsigned long ip, unsigned long parent_ip,
+				  struct ftrace_ops *op, struct pt_regs *regs)
+{
+	struct ftrace_vearly_entry *entry;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	if (ftrace_data_notrace.size &&
+	    ftrace_vearly_bsearch_addr(ip, ftrace_data_notrace))
+		return;
+
+	if (ftrace_data_filter.size &&
+	    !ftrace_vearly_bsearch_addr(ip, ftrace_data_filter))
+		return;
+#endif
+
+	if (vearly_entries_count >= VERY_EARLY_BUF_LEN) {
+		/* stop tracing when the buffer is full */
+		ftrace_vearly_disable();
+		return;
+	}
+
+	entry = &ftrace_vearly_entries[vearly_entries_count++];
+	entry->ip = ip;
+	entry->parent_ip = parent_ip;
+#ifdef CONFIG_X86_TSC
+	entry->clock = rdtsc();
+#else
+	entry->clock = trace_clock_local();
+#endif
+}
+
+/*
+ * ns = cycles * (10^6 / cpu_khz)
+ */
+static u64 __init cycles_to_ns(u64 cycles, u64 khz)
+{
+	u64 ns = cycles;
+	u64 dividend, divisor = 0;
+	const u64 constant = 1000000;
+
+	if (khz > constant) {
+		/* ns /= (khz / constant) */
+		dividend = khz;
+		divisor = constant;
+		do_div(dividend, divisor);
+		do_div(ns, dividend);
+	} else {
+		dividend = constant;
+		divisor = khz;
+		do_div(dividend, divisor);
+		ns *= dividend;
+	}
+	return ns;
+}
+
+/*
+ * Delta between the current and the previous event, used by the ftrace
+ * export to update the entry's time delta.
+ */
+static u64 export_delta __initdata;
+
+/*
+ * Update the entry timestamp.
+ */
+static void __init ftrace_vearly_export_write(const void *buf, unsigned int len)
+{
+	struct ring_buffer_event *event;
+	const u32 *entry = buf;
+
+	event = container_of(entry, struct ring_buffer_event, array[0]);
+	event->time_delta = export_delta;
+}
+
+void __init ftrace_early_fill_ringbuffer(void *data)
+{
+	struct ftrace_vearly_entry *entry;
+	struct trace_array *tr = data;
+	unsigned long cpu_khz;
+	u64 ns, prev_ns = 0;
+	int i;
+
+	if (!ftrace_vearly_enabled)
+		return;
+
+#ifdef CONFIG_X86_TSC
+	cpu_khz = native_calibrate_cpu();
+#endif
+
+	ftrace_vearly_export.write = ftrace_vearly_export_write;
+	register_ftrace_export(&ftrace_vearly_export);
+	preempt_disable_notrace();
+	for (i = 0; i < vearly_entries_count; i++) {
+		entry = &ftrace_vearly_entries[i];
+
+#ifdef CONFIG_X86_TSC
+		ns = cycles_to_ns(entry->clock, cpu_khz);
+#else
+		ns = entry->clock;
+#endif
+		export_delta = i > 0 ? ns - prev_ns : 0;
+		prev_ns = ns;
+		trace_function(tr, entry->ip, entry->parent_ip, 0, 0);
+	}
+	preempt_enable_notrace();
+	unregister_ftrace_export(&ftrace_vearly_export);
+}
+
+static __init int set_ftrace_vearly_enable(void *data, char *str)
+{
+	ftrace_vearly_enabled = 1;
+	return 1;
+}
+
+/*
+ * This will be used as a __setup_param.
+ */
+struct ftrace_vearly_obs_param {
+	int (*setup_func)(void *, char *);
+	const char *str;
+	void *data;
+};
+static struct ftrace_vearly_obs_param ftrace_vearly_params[] __initdata = {
+	{ .str = "ftrace_vearly", .setup_func = set_ftrace_vearly_enable },
+#ifdef CONFIG_DYNAMIC_FTRACE
+	{
+		.str = "ftrace_notrace",
+		.data = &ftrace_data_notrace,
+		.setup_func = set_ftrace_vearly_filtering,
+	},
+	{
+		.str = "ftrace_filter",
+		.data = &ftrace_data_filter,
+		.setup_func = set_ftrace_vearly_filtering,
+	},
+#endif
+};
+
+static int __init ftrace_do_very_early_param(char *param, char *val,
+					     const char *unused, void *arg)
+{
+	int size = ARRAY_SIZE(ftrace_vearly_params);
+	struct ftrace_vearly_obs_param *p;
+	int i;
+
+	for (i = 0; i < size; i++) {
+		p = &ftrace_vearly_params[i];
+		if (strcmp(param, p->str) == 0) {
+			p->setup_func(p->data, val);
+			return 0;
+		}
+	}
+	return 0;
+}
+
+void __init ftrace_early_init(char *command_line)
+{
+	/* proceed only if 'ftrace_vearly' is present */
+	if (!strstr(command_line, "ftrace_vearly"))
+		return;
+
+	strlcpy(tmp_cmdline, command_line, COMMAND_LINE_SIZE);
+	parse_args("ftrace vearly options", tmp_cmdline, NULL, 0, 0, 0, NULL,
+		   ftrace_do_very_early_param);
+
+	if (ftrace_vearly_enabled)
+		ftrace_vearly_trace_function = ftrace_function_vearly_trace_call;
+}
+
+void __init ftrace_early_shutdown(void)
+{
+	if (!ftrace_vearly_enabled)
+		return;
+
+	ftrace_vearly_disable();
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	pr_info("ftrace: vearly %u entries, notrace=%d, filter=%d\n",
+		vearly_entries_count,
+		ftrace_data_notrace.size,
+		ftrace_data_filter.size);
+#else
+	pr_info("ftrace: vearly %u recorded entries\n", vearly_entries_count);
+#endif
+}
+
+#endif /* CONFIG_VERY_EARLY_FUNCTION_TRACER */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 752e5daf0896..c097b97b3025 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8364,6 +8364,9 @@ __init static int tracer_alloc_buffers(void)
 	/* Function tracing may start here (via kernel command line) */
 	init_function_trace();

+	/* Pre-fill the ring buffer with the very early events */
+	ftrace_early_fill_ringbuffer(&global_trace);
+
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;

--
2.11.0
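P.S. On sizing, to make the defaults concrete: with VERY_EARLY_BUF_SHIFT=19
and a 64-bit kernel, sizeof(struct ftrace_vearly_entry) is 24 bytes (two
unsigned long plus a u64), so the static buffer holds

	(1 << 19) / 24 = 21845 entries

before tracing disables itself via ftrace_vearly_disable(); tracing a
longer very early boot needs a larger shift.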