Re: dt: add kernel function boundary tracing provider

2020-11-19 Thread Martin Pieuchot
Hello Tom,

Thanks for sharing your work, that's awesome!

On 14/11/20(Sat) 13:13, Tom Rollet wrote:
> Here is a diff for dynamic tracing of kernel's functions boundaries.
> It's implemented as one of the dt's provider on i386 and amd64.
> To activate it, DDBPROF and pseudo device dt must be activated
> on GENERIC.

Could we replace the DDBPROF define by NDT > 0?  Is it still possible to
build a profiling kernel if we do so?

Being able to use the existing kernel profiling code is a nice way to
test prologue patching but if performances are really bad we might want
to keep a way to use the old profiling method.  If it's too complicated
to have both coexist I'd suggest we get rid of DDBPROF and use the
intr-based patching code for dt(4) only.

> For now it's working like DTRACE fbt(function boundaries tracing).
> 
> We replace the prologue and epilogue of each function with a breakpoint.
> The replaced instructions on amd64 and i386 are respectively
> "push %rbp", "push %ebp" for prologue and "ret", "pop %ebp" for epilogue.
> These instructions are emulated at the end of the breakpoint handler,
> just after sending an event to userland (to be read by btrace).
> 
> For now the lookup to find the instruction to replace is hardcoded
> and needs to find if there is a retguard before the prologue of the
> function or if there are multiple int3 instructions after the last ret.
> It allows finding 31896 entry points and 19513 return points on amd64.

That's a great start.

> ddb also has a similar way of tracing all function prologues. It now
> uses this new dt provider for doing tracing.
> 
> Perf wise, when all entry probes are enabled, the slow down is
> quite massive but expected since breakpoints are slow.
> On the kernel compilation with 10 threads on a linux qemu VM we go from:
> real   2m38.280s
> user   10m2.050s
> sys    14m10.360s
> 
> to:
> real   24m19.280s
> user   9m44.110s
> sys    220m26.980s

Did you try on i386 as well?  How is it?

> Any comments on the diff?

See inline.

> diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
> index dd2dfde3e3b..3a0a58ba3ac 100644
> --- a/sys/arch/amd64/amd64/vector.S
> +++ b/sys/arch/amd64/amd64/vector.S
> @@ -188,10 +188,11 @@ INTRENTRY_LABEL(trap03):
>  sti
>  cld
>  SMAP_CLAC
> -    movq    %rsp, %rdi
> -    call    _C_LABEL(db_prof_hook)
> -    cmpl    $1, %eax
> -    jne    .Lreal_kern_trap
> +    leaq    _C_LABEL(dt_prov_kprobe), %rdi
> +    movq    %rsp, %rsi
> +    call    _C_LABEL(dt_prov_kprobe_hook)

Why don't we call a function with no argument?  Maybe the current
interface is over designed and we should not use it.

> +    cmpl    $0, %eax
> +    je .Lreal_kern_trap
> 
>  cli
>  movq    TF_RDI(%rsp),%rdi
> @@ -210,6 +211,11 @@ INTRENTRY_LABEL(trap03):
>  movq    TF_R11(%rsp),%r11
>  /* %rax restored below, after being used to shift the stack */
> 
> +    cmpl    $2, %eax
> +    je  .Lemulate_ret
> +
> +.Lemulate_push_rbp:
> +
>  /*
>   * We are returning from a probe trap so we need to fix the
>   * stack layout and emulate the patched instruction.
> @@ -217,6 +223,9 @@ INTRENTRY_LABEL(trap03):
>   */
>  subq    $16, %rsp
> 
> +    movq    (TF_RAX + 16)(%rsp), %rax
> +    movq    %rax, TF_RAX(%rsp)
> +
>  /* Shift hardware-saved registers. */
>  movq    (TF_RIP + 16)(%rsp), %rax
>  movq    %rax, TF_RIP(%rsp)
> @@ -237,7 +246,20 @@ INTRENTRY_LABEL(trap03):
> 
>  /* Finally restore %rax */
>  movq    (TF_RAX + 16)(%rsp),%rax
> +    jmp .ret_int3

Shouldn't we copy the two instructions here instead of jumping?  If
you're after reducing code duplication you can look at macros like
the ones used in frameasm.h.

> +.Lemulate_ret:
> +
> +    /* Store a new return address in %rip */
> +    movq    TF_RSP(%rsp), %rax
> +    movq    (%rax), %rax
> +    movq    %rax, TF_RIP(%rsp)
> +    addq    $8, TF_RSP(%rsp)
> +
> +    /* Finally restore %rax */
> +    movq    (TF_RAX)(%rsp),%rax
> 
> +.ret_int3:
>  addq    $TF_RIP,%rsp
>  iretq
>  #endif /* !defined(GPROF) && defined(DDBPROF) */
> diff --git a/sys/arch/i386/i386/locore.s b/sys/arch/i386/i386/locore.s
> index 0c5607fe38a..e2d43b47eb3 100644
> --- a/sys/arch/i386/i386/locore.s
> +++ b/sys/arch/i386/i386/locore.s
> @@ -205,7 +205,8 @@ INTRENTRY_LABEL(label):    /* from kernel */    ; \
>  #define    INTRFASTEXIT \
>  jmp    intr_fast_exit
> 
> -#define    INTR_FAKE_TRAP    0xbadabada
> +#define    INTR_FAKE_TRAP_PUSH_RPB    0xbadabada
> +#define    INTR_FAKE_TRAP_POP_RBP    0xbcbcbcbc
> 
>  /*
>   * PTmap is recursive pagemap at top of virtual address space.
> @@ -1259,17 +1260,32 @@ calltrap:
>  jne    .Lreal_trap
> 
>  pushl    %esp
> -    call    _C_LABEL(db_prof_hook)
> -    addl    $4,%esp
> -    cmpl    $1,%eax
> -    jne    .Lreal_trap
> +    subl    $4, %esp
> +    pushl   %eax
> +    leal    _C_LABEL(dt_prov_kprobe), %eax
> +    movl    %eax, 4(%esp)
> +    popl    %eax
> +    call    _C_LABEL(dt_prov_kprobe_hook)

Re: dt: add kernel function boundary tracing provider

2020-11-16 Thread Tom Rollet

As requested by Philip Guenther here is more information on how to use
this new dt provider.

After applying the diff you need to add to /etc/sysctl.conf
ddb.profile=1
kern.allowdt=1
kern.allowkmem=1

There are two ways to start the tracing.
The first one is to use kgmon that will instantiate all entry probes,
gprof for reading the gmon.out files.

The second one is with btrace. Btrace can take scripts as input to
start tracing. It follows the bpftrace language.
Here is the bpftrace man:
https://www.mankier.com/8/bpftrace

Example of a simple script that you can write.
(btrace -l for the list of available probes)

BEGIN
{
    @open_entry = 0;
    @open_ret = 0;
}

kprobe:sys_pread:entry
{
     printf("sys_open: %d\n", @open_entry);
    @open_entry = @open_entry + 1;

    //Dump all cpus used
    @[cpu] = count();

}

kprobe:sys_pread:return
{
     printf("sys_ret: %d\n", @open_entry);
    @ret_entry = @open_entry + 1;
}

Printfs are done at trace time whereas maps (@[...]) are dumped when the
tracing is ended with Ctrl-C.

Tom



dt: add kernel function boundary tracing provider

2020-11-14 Thread Tom Rollet

Hi,

Here is a diff for dynamic tracing of kernel's functions boundaries.
It's implemented as one of the dt's provider on i386 and amd64.
To activate it, DDBPROF and pseudo device dt must be activated
on GENERIC.

For now it's working like DTRACE fbt(function boundaries tracing).

We replace the prologue and epilogue of each function with a breakpoint.
The replaced instructions on amd64 and i386 are respectively
"push %rbp", "push %ebp" for prologue and "ret", "pop %ebp" for epilogue.
These instructions are emulated at the end of the breakpoint handler,
just after sending an event to userland (to be read by btrace).

For now the lookup to find the instruction to replace is hardcoded
and needs to find if there is a retguard before the prologue of the
function or if there are multiple int3 instructions after the last ret.
It allows finding 31896 entry points and 19513 return points on amd64.

ddb also has a similar way of tracing all function prologues. It now
uses this new dt provider for doing tracing.

Perf wise, when all entry probes are enabled, the slow down is
quite massive but expected since breakpoints are slow.
On the kernel compilation with 10 threads on a linux qemu VM we go from:
real   2m38.280s
user   10m2.050s
sys    14m10.360s

to:
real   24m19.280s
user   9m44.110s
sys    220m26.980s

Any comments on the diff?

Tom

diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
index dd2dfde3e3b..3a0a58ba3ac 100644
--- a/sys/arch/amd64/amd64/vector.S
+++ b/sys/arch/amd64/amd64/vector.S
@@ -188,10 +188,11 @@ INTRENTRY_LABEL(trap03):
 sti
 cld
 SMAP_CLAC
-    movq    %rsp, %rdi
-    call    _C_LABEL(db_prof_hook)
-    cmpl    $1, %eax
-    jne    .Lreal_kern_trap
+    leaq    _C_LABEL(dt_prov_kprobe), %rdi
+    movq    %rsp, %rsi
+    call    _C_LABEL(dt_prov_kprobe_hook)
+    cmpl    $0, %eax
+    je .Lreal_kern_trap

 cli
 movq    TF_RDI(%rsp),%rdi
@@ -210,6 +211,11 @@ INTRENTRY_LABEL(trap03):
 movq    TF_R11(%rsp),%r11
 /* %rax restored below, after being used to shift the stack */

+    cmpl    $2, %eax
+    je  .Lemulate_ret
+
+.Lemulate_push_rbp:
+
 /*
  * We are returning from a probe trap so we need to fix the
  * stack layout and emulate the patched instruction.
@@ -217,6 +223,9 @@ INTRENTRY_LABEL(trap03):
  */
 subq    $16, %rsp

+    movq    (TF_RAX + 16)(%rsp), %rax
+    movq    %rax, TF_RAX(%rsp)
+
 /* Shift hardware-saved registers. */
 movq    (TF_RIP + 16)(%rsp), %rax
 movq    %rax, TF_RIP(%rsp)
@@ -237,7 +246,20 @@ INTRENTRY_LABEL(trap03):

 /* Finally restore %rax */
 movq    (TF_RAX + 16)(%rsp),%rax
+    jmp .ret_int3
+
+.Lemulate_ret:
+
+    /* Store a new return address in %rip */
+    movq    TF_RSP(%rsp), %rax
+    movq    (%rax), %rax
+    movq    %rax, TF_RIP(%rsp)
+    addq    $8, TF_RSP(%rsp)
+
+    /* Finally restore %rax */
+    movq    (TF_RAX)(%rsp),%rax

+.ret_int3:
 addq    $TF_RIP,%rsp
 iretq
 #endif /* !defined(GPROF) && defined(DDBPROF) */
diff --git a/sys/arch/i386/i386/locore.s b/sys/arch/i386/i386/locore.s
index 0c5607fe38a..e2d43b47eb3 100644
--- a/sys/arch/i386/i386/locore.s
+++ b/sys/arch/i386/i386/locore.s
@@ -205,7 +205,8 @@ INTRENTRY_LABEL(label):    /* from kernel */    ; \
 #define    INTRFASTEXIT \
 jmp    intr_fast_exit

-#define    INTR_FAKE_TRAP    0xbadabada
+#define    INTR_FAKE_TRAP_PUSH_RPB    0xbadabada
+#define    INTR_FAKE_TRAP_POP_RBP    0xbcbcbcbc

 /*
  * PTmap is recursive pagemap at top of virtual address space.
@@ -1259,17 +1260,32 @@ calltrap:
 jne    .Lreal_trap

 pushl    %esp
-    call    _C_LABEL(db_prof_hook)
-    addl    $4,%esp
-    cmpl    $1,%eax
-    jne    .Lreal_trap
+    subl    $4, %esp
+    pushl   %eax
+    leal    _C_LABEL(dt_prov_kprobe), %eax
+    movl    %eax, 4(%esp)
+    popl    %eax
+    call    _C_LABEL(dt_prov_kprobe_hook)
+    addl    $8,%esp
+    cmpl    $0,%eax
+    je    .Lreal_trap

 /*
  * Abuse the error field to indicate that INTRFASTEXIT needs
  * to emulate the patched instruction.
  */
-    movl    $INTR_FAKE_TRAP, TF_ERR(%esp)
-    jz    .Lalltraps_check_asts
+    cmpl $1, %eax
+    je .Lset_emulate_push_rbp
+
+    cmpl $2, %eax
+    je .Lset_emulate_ret
+
+.Lset_emulate_push_rbp:
+    movl    $INTR_FAKE_TRAP_PUSH_RPB, TF_ERR(%esp)
+    jmp    .Lalltraps_check_asts
+.Lset_emulate_ret:
+    movl    $INTR_FAKE_TRAP_POP_RBP, TF_ERR(%esp)
+    jmp    .Lalltraps_check_asts
 .Lreal_trap:
 #endif /* !defined(GPROF) && defined(DDBPROF) */
 pushl    %esp
@@ -1298,8 +1314,10 @@ calltrap:
  * The code below does that by trashing %eax, so it MUST be
  * restored afterward.
  */
-    cmpl    $INTR_FAKE_TRAP, TF_ERR(%esp)
-    je    .Lprobe_fixup
+    cmpl    $INTR_FAKE_TRAP_PUSH_RPB, TF_ERR(%esp)
+    je    .Lprobe_fixup_push_rbp
+    cmpl    $INTR_FAKE_TRAP_POP_RBP, TF_ERR(%esp)
+    je    .Lprobe_fixup_pop_rbp
 #endif /* !defined(GPROF) && defined(DDBPROF) */
 #ifndef DIAGNOS