On Tue, 13 Apr 2021 at 21:05, Craig Ringer
<craig.rin...@enterprisedb.com> wrote:
> On Tue, 13 Apr 2021 at 11:06, Andres Freund <and...@anarazel.de> wrote:
> > IIRC those aren't really comparable - the kernel actually does modify
> > the executable code to replace the tracepoints with nops.
>
> Same with userspace static trace markers (USDTs).
>
> A followup mail will contain a testcase and samples to demonstrate this.

Demo follows, with source attached too. gcc 10.2 compiling with -O2,
using dtrace and <sys/sdt.h> from systemtap 4.4.

Trivial empty function definition:

    __attribute__((noinline))
    void
    no_args(void)
    {
                SDT_NOOP_NO_ARGS();
    }

Disassembly when SDT_NOOP_NO_ARGS is defined as

    #define SDT_NOOP_NO_ARGS()

is:

    <no_args>:
        retq

When built with a probes.d definition processed by the dtrace script
instead, the disassembly becomes:

    <no_args>:
        nop
        retq

So ... yup, it's a nop.

Now, if we introduce semaphores, that changes.

    __attribute__((noinline))
    void
    no_args(void)
    {
        if (SDT_NOOP_NO_ARGS_ENABLED())
                SDT_NOOP_NO_ARGS();
    }

disassembles to:

    <no_args>:
        cmpw   $0x0,0x2ec4(%rip)        # <sdt_noop_no_args_semaphore>
        jne    <no_args+0x10>
        retq
        nopl   0x0(%rax,%rax,1)
        nop
        retq

so the semaphore test is actually quite harmful and wasteful in this
case. That's not surprising since this SDT is a simple marker point.
But what if we supply arguments to it? It turns out that the
disassembly is the same if args are passed, whether locals or globals,
including globals assigned based on program input that can't be
determined at compile time. Still just a nop.

If I pass a function call as an argument expression to a probe, e.g.

    __attribute__((noinline)) static int
    compute_probe_argument(void)
    {
        return 100;
    }

    void
    with_computed_arg(void)
    {
        SDT_NOOP_WITH_COMPUTED_ARG(compute_probe_argument());
    }

then the disassembly with SDTs is:

    <with_computed_arg>:
        callq  <compute_probe_argument>
        nop
        retq

so the function call isn't elided even though its result is unused when
the probe is disabled. That's somewhat expected. The same will be true if
the arguments to a probe require pointer chasing or non-trivial marshalling.

If a semaphore guard is added this becomes:

    <with_computed_arg>:
        cmpw   $0x0,0x2e2e(%rip)        # <sdt_noop_with_computed_arg_semaphore>
        jne    <with_computed_arg+0x10>
        retq
        nopl   0x0(%rax,%rax,1)
        callq  <compute_probe_argument>
        nop
        retq

so now the call to compute_probe_argument() is skipped unless the
probe is enabled, but the function is longer and requires a test and
jump.

If I dummy up a function that does some pointer chasing, without
semaphores I get

<with_pointer_chasing>:
    mov    (%rdi),%rax
    mov    (%rax),%rax
    mov    (%rax),%rax
    nop
    retq

so the arguments are marshalled then ignored.

With semaphores I get:

<with_pointer_chasing>:
    cmpw   $0x0,0x2d90(%rip)        # <sdt_noop_with_pointer_chasing_semaphore>
    jne    <with_pointer_chasing+0x10>
    retq
    nopl   0x0(%rax,%rax,1)
    mov    (%rdi),%rax
    mov    (%rax),%rax
    mov    (%rax),%rax
    nop
    retq

so again the probe's argument marshalling is inline in the function
body, but at the end, and skipped over.

Findings:

* A probe without arguments or with simple arguments is just a 'nop' instruction.
* Probes that require function calls, pointer chasing, other
expression evaluation, etc. may impose a fixed cost to collect up
arguments even if the probe is disabled.
* SDT semaphores can avoid that cost but add a branch, so they should
probably be avoided unless preparing probe arguments is likely to be
expensive.

Hideous but effective demo code attached.
/*
 * Probe definitions for the "sdt_noop" demo provider.
 *
 * Each probe below corresponds to one SDT_NOOP_* probe site in the
 * accompanying C demo program; the probes differ only in the number and
 * kind of arguments they take.
 */
provider sdt_noop {
	probe no_args();
	probe with_args(int arg1, int arg2, int arg3);
	probe with_global_arg(int arg1);
	probe with_volatile_arg(int arg1);
	probe with_many_args(int arg1, int arg2, int arg3, int64_t arg4, int64_t arg5, int64_t arg6, int64_t arg7, int64_t arg8);
	probe with_computed_arg(int arg1);
	probe with_pointer_chasing(int arg1);
};

Attachment: Makefile
Description: Binary data

#include <stdint.h>
#include <stdlib.h>

#ifdef USE_SDT
#include "sdt_noop_probes_enabled.h"
#else
#include "sdt_noop_probes_disabled.h"
#endif

/*
 * Forward declarations of the probe-site functions defined below.
 * Each function contains exactly one SDT_NOOP_* probe site, optionally
 * guarded by its *_ENABLED() check when USE_SDT_SEMAPHORES is defined.
 */
void no_args(void);
int with_args(void);
int with_global_arg(void);
int with_volatile_arg(void);
void with_many_args(void);
void with_computed_arg(void);
void with_pointer_chasing(int**** arg);

/*
 * Probe site with no arguments.
 *
 * With USE_SDT_SEMAPHORES defined, the probe is only reached when its
 * *_ENABLED() check is true; otherwise the probe site is unconditional.
 * Kept out of line so it shows up as its own symbol in the disassembly.
 */
__attribute__((noinline)) void
no_args(void)
{
#if defined(USE_SDT_SEMAPHORES)
	if (!(SDT_NOOP_NO_ARGS_ENABLED()))
		return;
#endif
	SDT_NOOP_NO_ARGS();
}

/*
 * Probe site taking three local integer arguments.
 *
 * Returns the sum of the three locals (0 + 1 + 2) so that they are also
 * used outside the probe site.
 */
__attribute__((noinline)) int
with_args(void)
{
	int first = 0;
	int second = 1;
	int third = 2;

#if defined(USE_SDT_SEMAPHORES)
	if (SDT_NOOP_WITH_ARGS_ENABLED())
#endif
	{
		SDT_NOOP_WITH_ARGS(first, second, third);
	}

	return first + second + third;
}

/* Global passed to the with_global_arg probe; assigned by main(). */
int some_global;

/*
 * Probe site whose single argument is the global some_global.
 *
 * Returns the current value of some_global.
 */
__attribute__((noinline)) int
with_global_arg(void)
{
#if defined(USE_SDT_SEMAPHORES)
	if (SDT_NOOP_WITH_GLOBAL_ARG_ENABLED())
#endif
	{
		SDT_NOOP_WITH_GLOBAL_ARG(some_global);
	}

	return some_global;
}

/*
 * Probe site whose single argument is a volatile local.
 *
 * The local is written once (42), handed to the probe, and then read
 * again for the return value.
 */
__attribute__((noinline)) int
with_volatile_arg(void)
{
	volatile int probe_value = 42;

#if defined(USE_SDT_SEMAPHORES)
	if (SDT_NOOP_WITH_VOLATILE_ARG_ENABLED())
#endif
	{
		SDT_NOOP_WITH_VOLATILE_ARG(probe_value);
	}

	return probe_value;
}

/*
 * Probe site taking eight constant arguments (1 through 8).
 *
 * With USE_SDT_SEMAPHORES defined, the probe is skipped unless its
 * *_ENABLED() check is true.
 */
__attribute__((noinline)) void
with_many_args(void)
{
#if defined(USE_SDT_SEMAPHORES)
	if (!(SDT_NOOP_WITH_MANY_ARGS_ENABLED()))
		return;
#endif
	SDT_NOOP_WITH_MANY_ARGS(1, 2, 3, 4, 5, 6, 7, 8);
}

/*
 * Stand-in for a probe argument that must be produced by a function call.
 * Kept out of line so the call remains visible at the probe site.
 *
 * Returns the fixed value 100.
 */
static __attribute__((noinline)) int
compute_probe_argument(void)
{
	enum { PROBE_ARGUMENT_VALUE = 100 };

	return PROBE_ARGUMENT_VALUE;
}

/*
 * Probe site whose argument is the result of calling
 * compute_probe_argument().
 *
 * With USE_SDT_SEMAPHORES defined, both the call and the probe are
 * skipped unless the probe's *_ENABLED() check is true.
 */
__attribute__((noinline)) void
with_computed_arg(void)
{
#if defined(USE_SDT_SEMAPHORES)
	if (!(SDT_NOOP_WITH_COMPUTED_ARG_ENABLED()))
		return;
#endif
	SDT_NOOP_WITH_COMPUTED_ARG(compute_probe_argument());
}

/*
 * Probe site whose argument requires following four levels of pointers.
 *
 * arg: pointer chain that is fully dereferenced (****arg) to obtain the
 *      int passed to the probe; every level must be valid when the
 *      argument is evaluated.
 */
__attribute__((noinline)) void
with_pointer_chasing(int ****arg)
{
#if defined(USE_SDT_SEMAPHORES)
	if (!(SDT_NOOP_WITH_POINTER_CHASING_ENABLED()))
		return;
#endif
	SDT_NOOP_WITH_POINTER_CHASING(****arg);
}

/*
 * Driver: calls every probe-site function so each is reachable and kept
 * in the binary.
 *
 * argc is used only as a value that is unknown at compile time; argv is
 * unused.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the heap allocation used for
 * the pointer-chasing demo fails.
 */
int
main(int argc, char *argv[] __attribute__((unused)))
{
	no_args();

	with_args();

	with_many_args();

	/* Assign from program input so the compiler cannot constant-fold it. */
	some_global = argc;
	with_global_arg();

	with_volatile_arg();

	with_many_args();

	with_computed_arg();

	/* Build an int**** chain ending in a heap-allocated int. */
	int *some_value = malloc(sizeof *some_value);
	if (some_value == NULL)
		return EXIT_FAILURE;
	*some_value = 0x7f;

	int **some_value_p = &some_value;
	int ***some_value_pp = &some_value_p;
	with_pointer_chasing(&some_value_pp);

	free(some_value);

	return EXIT_SUCCESS;
}

Reply via email to