[PATCH bpf-next 0/3] uprobes: two common case speed ups

2024-03-12 Thread Andrii Nakryiko
This patch set implements two speed ups for uprobe/uretprobe runtime execution
path for some common scenarios: BPF-only uprobes (patches #1 and #2) and
system-wide (non-PID-specific) uprobes (patch #3). Please see individual
patches for details.

Given I haven't worked with uprobe code before, I'm unfamiliar with
conventions in this subsystem, including which kernel tree patches should be
sent to. For now I based all the changes on top of bpf-next/master, which is
where I tested and benchmarked everything anyway. Please advise what I should
use as a base for subsequent revisions. Thanks.

Andrii Nakryiko (3):
  uprobes: encapsulate preparation of uprobe args buffer
  uprobes: prepare uprobe args buffer lazily
  uprobes: add speculative lockless system-wide uprobe filter check

 kernel/trace/trace_uprobe.c | 103 ++--
 1 file changed, 63 insertions(+), 40 deletions(-)

-- 
2.43.0




[PATCH bpf-next 1/3] uprobes: encapsulate preparation of uprobe args buffer

2024-03-12 Thread Andrii Nakryiko
Move the logic of fetching the temporary per-CPU uprobe buffer and storing
uprobe args into it to a new helper function. Store the data size as part
of this buffer, simplifying interfaces a bit, as now we only pass a single
uprobe_cpu_buffer reference around, instead of pointer + dsize.

This logic was duplicated across uprobe_dispatcher and uretprobe_dispatcher,
and now will be centralized. All this is also in preparation to make
this uprobe_cpu_buffer handling logic optional in the next patch.

Signed-off-by: Andrii Nakryiko 
---
 kernel/trace/trace_uprobe.c | 75 -
 1 file changed, 41 insertions(+), 34 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a84b85d8aac1..a0f60bb10158 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = {
 struct uprobe_cpu_buffer {
struct mutex mutex;
void *buf;
+   int dsize;
 };
 static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
 static int uprobe_buffer_refcnt;
@@ -943,9 +944,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer 
*ucb)
mutex_unlock(&ucb->mutex);
 }
 
+static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
+  struct pt_regs *regs)
+{
+   struct uprobe_cpu_buffer *ucb;
+   int dsize, esize;
+
+   esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+   dsize = __get_data_size(&tu->tp, regs);
+
+   ucb = uprobe_buffer_get();
+   ucb->dsize = dsize;
+
+   store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+
+   return ucb;
+}
+
 static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
-   struct uprobe_cpu_buffer *ucb, int dsize,
+   struct uprobe_cpu_buffer *ucb,
struct trace_event_file *trace_file)
 {
struct uprobe_trace_entry_head *entry;
@@ -956,14 +974,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
WARN_ON(call != trace_file->event_call);
 
-   if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
+   if (WARN_ON_ONCE(tu->tp.size + ucb->dsize > PAGE_SIZE))
return;
 
if (trace_trigger_soft_disabled(trace_file))
return;
 
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-   size = esize + tu->tp.size + dsize;
+   size = esize + tu->tp.size + ucb->dsize;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
@@ -977,14 +995,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
 
-   memcpy(data, ucb->buf, tu->tp.size + dsize);
+   memcpy(data, ucb->buf, tu->tp.size + ucb->dsize);
 
trace_event_buffer_commit(&fbuffer);
 }
 
 /* uprobe handler */
 static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
-struct uprobe_cpu_buffer *ucb, int dsize)
+struct uprobe_cpu_buffer *ucb)
 {
struct event_file_link *link;
 
@@ -993,7 +1011,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 
struct pt_regs *regs,
 
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
-   __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+   __uprobe_trace_func(tu, 0, regs, ucb, link->file);
rcu_read_unlock();
 
return 0;
@@ -1001,13 +1019,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 
struct pt_regs *regs,
 
 static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
 struct pt_regs *regs,
-struct uprobe_cpu_buffer *ucb, int dsize)
+struct uprobe_cpu_buffer *ucb)
 {
struct event_file_link *link;
 
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
-   __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+   __uprobe_trace_func(tu, func, regs, ucb, link->file);
rcu_read_unlock();
 }
 
@@ -1335,7 +1353,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
 
 static void __uprobe_perf_func(struct trace_uprobe *tu,
   unsigned long func, struct pt_regs *regs,
-  struct uprobe_cpu_buffer *ucb, int dsize)
+  struct uprobe_cpu_buffer *ucb)
 {
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
@@ -1356,7 +1374,7 @@ static void __uprobe_perf_func(struct 

[PATCH bpf-next 2/3] uprobes: prepare uprobe args buffer lazily

2024-03-12 Thread Andrii Nakryiko
uprobe_cpu_buffer and corresponding logic to store uprobe args into it
are used for uprobes/uretprobes that are created through tracefs or
perf events.

BPF is yet another user of uprobe/uretprobe infrastructure, but doesn't
need uprobe_cpu_buffer and associated data. For BPF-only use cases this
buffer handling and preparation is a pure overhead. At the same time,
BPF-only uprobe/uretprobe usage is very common in practice. Also, in
a lot of cases applications are very sensitive to performance overheads,
as they might be tracing very high frequency functions like
malloc()/free(), so every bit of performance improvement matters.

All that is to say that this uprobe_cpu_buffer preparation is an
unnecessary overhead that each BPF user of uprobes/uretprobes has to pay.
This patch changes this by making uprobe_cpu_buffer preparation
optional. It will happen only if either a tracefs-based or a perf event-based
uprobe/uretprobe consumer is registered for a given uprobe/uretprobe. For
BPF-only use cases this step will be skipped.

We used the uprobe/uretprobe benchmark which is part of BPF selftests (see [0])
to estimate the improvements. We have 3 uprobe and 3 uretprobe
scenarios, which vary the instruction that is replaced by the uprobe: nop
(fastest uprobe case), `push rbp` (typical case), and non-simulated
`ret` instruction (slowest case). The benchmark thread constantly calls a
user space function in a tight loop. The user space function has an attached
BPF uprobe or uretprobe program doing nothing but atomic counter
increments to count the number of triggering calls. The benchmark emits
throughput in millions of executions per second.
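
A minimal sketch of the BPF side of such a benchmark (illustrative only;
names are made up and this is not the actual bench_trigger.c source) could
look like this:

/* BPF program that does nothing but count how many times it triggered. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

long hits = 0;          /* read by the user space harness to compute M/s */

SEC("uprobe")
int bench_uprobe(struct pt_regs *ctx)
{
        __sync_fetch_and_add(&hits, 1); /* atomic counter increment */
        return 0;
}

char LICENSE[] SEC("license") = "GPL";

The user space side simply calls the probed function in a tight loop and
periodically reads `hits` to report throughput.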

BEFORE these changes

uprobe-nop :2.657 ± 0.024M/s
uprobe-push:2.499 ± 0.018M/s
uprobe-ret :1.100 ± 0.006M/s
uretprobe-nop  :1.356 ± 0.004M/s
uretprobe-push :1.317 ± 0.019M/s
uretprobe-ret  :0.785 ± 0.007M/s

AFTER these changes
===
uprobe-nop :2.732 ± 0.022M/s (+2.8%)
uprobe-push:2.621 ± 0.016M/s (+4.9%)
uprobe-ret :1.105 ± 0.007M/s (+0.5%)
uretprobe-nop  :1.396 ± 0.007M/s (+2.9%)
uretprobe-push :1.347 ± 0.008M/s (+2.3%)
uretprobe-ret  :0.800 ± 0.006M/s (+1.9%)

So the improvements on this particular machine seem to be between 2% and 5%.

  [0] 
https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c

Signed-off-by: Andrii Nakryiko 
---
 kernel/trace/trace_uprobe.c | 56 ++---
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a0f60bb10158..f2875349d124 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -963,15 +963,22 @@ static struct uprobe_cpu_buffer 
*prepare_uprobe_buffer(struct trace_uprobe *tu,
 
 static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
-   struct uprobe_cpu_buffer *ucb,
+   struct uprobe_cpu_buffer **ucbp,
struct trace_event_file *trace_file)
 {
struct uprobe_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
+   struct uprobe_cpu_buffer *ucb;
void *data;
int size, esize;
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
 
+   ucb = *ucbp;
+   if (!ucb) {
+   ucb = prepare_uprobe_buffer(tu, regs);
+   *ucbp = ucb;
+   }
+
WARN_ON(call != trace_file->event_call);
 
if (WARN_ON_ONCE(tu->tp.size + ucb->dsize > PAGE_SIZE))
@@ -1002,7 +1009,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
 /* uprobe handler */
 static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
-struct uprobe_cpu_buffer *ucb)
+struct uprobe_cpu_buffer **ucbp)
 {
struct event_file_link *link;
 
@@ -1011,7 +1018,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 
struct pt_regs *regs,
 
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
-   __uprobe_trace_func(tu, 0, regs, ucb, link->file);
+   __uprobe_trace_func(tu, 0, regs, ucbp, link->file);
rcu_read_unlock();
 
return 0;
@@ -1019,13 +1026,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 
struct pt_regs *regs,
 
 static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
 struct pt_regs *regs,
-struct uprobe_cpu_buffer *ucb)
+struct uprobe_cpu_buffer **ucbp)
 {
struct event_file_link *link;
 
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
-   __uprobe_trace_func(tu, func, regs, ucb, link->file);
+  

[PATCH bpf-next 3/3] uprobes: add speculative lockless system-wide uprobe filter check

2024-03-12 Thread Andrii Nakryiko
It's very common with BPF-based uprobe/uretprobe use cases to have
system-wide (not PID-specific) probes in use. In this case the uprobe's
trace_uprobe_filter->nr_systemwide counter is bumped at registration
time, and actual filtering is short-circuited at the time the
uprobe/uretprobe is triggered.

This is a great optimization, and the only issue with it is that, to even
get to checking this counter, the uprobe subsystem takes the
read side of trace_uprobe_filter->rwlock. This is actually noticeable in
profiles and is just another point of contention when a uprobe is
triggered on multiple CPUs simultaneously.

This patch adds a speculative check before grabbing that rwlock. If
nr_systemwide is non-zero, the lock is skipped and the event is passed through.
From examining the existing logic it looks correct and safe to do. If
nr_systemwide is being modified under the rwlock in parallel, we have to
consider basically just one important race condition: the case when
nr_systemwide is dropped from one to zero (from
trace_uprobe_filter_remove()) under filter->rwlock, but
uprobe_perf_filter() raced and saw it as >0.

In this case, we'll proceed with uprobe/uretprobe execution, while
uprobe_perf_close() and uprobe_apply() will be blocked trying to grab
uprobe->register_rwsem as a writer. They will be blocked because
uprobe_dispatcher() (and, similarly, uretprobe_dispatcher()) runs with
uprobe->register_rwsem taken as a reader. So there is no real race,
besides the uprobe/uretprobe possibly executing one last time before it's
removed, which is fine because from the user space perspective the
uprobe/uretprobe hasn't yet been deactivated.

In case we speculatively read nr_systemwide as zero while it was
incremented in parallel, we'll proceed to grabbing filter->rwlock and
re-doing the check, this time in a lock-protected and non-racy way.

As such, it looks safe to do a quick short-circuiting check and save
some performance in the very common system-wide case, without sacrificing
hot path performance due to the much rarer possibility of registration or
unregistration of uprobes.

Again, confirming the improvements with BPF selftests-based benchmarks.

BEFORE (based on changes in previous patch)
===
uprobe-nop :2.732 ± 0.022M/s
uprobe-push:2.621 ± 0.016M/s
uprobe-ret :1.105 ± 0.007M/s
uretprobe-nop  :1.396 ± 0.007M/s
uretprobe-push :1.347 ± 0.008M/s
uretprobe-ret  :0.800 ± 0.006M/s

AFTER
=
uprobe-nop :2.878 ± 0.017M/s (+5.5%, total +8.3%)
uprobe-push:2.753 ± 0.013M/s (+5.3%, total +10.2%)
uprobe-ret :1.142 ± 0.010M/s (+3.8%, total +3.8%)
uretprobe-nop  :1.444 ± 0.008M/s (+3.5%, total +6.5%)
uretprobe-push :1.410 ± 0.010M/s (+4.8%, total +7.1%)
uretprobe-ret  :0.816 ± 0.002M/s (+2.0%, total +3.9%)

In the above, the first percentage value is relative to the previous patch
(lazy uprobe buffer optimization), while the "total" percentage is
relative to the kernel without any of the changes in this patch set.

As can be seen, we get about 4% - 10% speed up, in total, with both lazy
uprobe buffer and speculative filter check optimizations.

Signed-off-by: Andrii Nakryiko 
---
 kernel/trace/trace_uprobe.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index f2875349d124..be28e6d0578e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1351,6 +1351,10 @@ static bool uprobe_perf_filter(struct uprobe_consumer 
*uc,
tu = container_of(uc, struct trace_uprobe, consumer);
filter = tu->tp.event->filter;
 
+   /* speculative check */
+   if (READ_ONCE(filter->nr_systemwide))
+   return true;
+
read_lock(&filter->rwlock);
ret = __uprobe_perf_filter(filter, mm);
read_unlock(&filter->rwlock);
-- 
2.43.0




Re: [PATCH bpf-next 1/3] uprobes: encapsulate preparation of uprobe args buffer

2024-03-13 Thread Andrii Nakryiko
On Wed, Mar 13, 2024 at 8:16 AM Oleg Nesterov  wrote:
>
> LGTM, one nit below.
>
> On 03/12, Andrii Nakryiko wrote:
> >
> > +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe 
> > *tu,
> > +struct pt_regs *regs)
> > +{
> > + struct uprobe_cpu_buffer *ucb;
> > + int dsize, esize;
> > +
> > + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
> > + dsize = __get_data_size(&tu->tp, regs);
> > +
> > + ucb = uprobe_buffer_get();
> > + ucb->dsize = dsize;
> > +
> > + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
> > +
> > + return ucb;
> > +}
>
> OK, but note that every user of ->dsize adds tp.size. So I think you can
> simplify this code a bit more if you change prepare_uprobe_buffer() to do
>
> ucb->dsize = tu->tp.size + dsize;
>
> and update the users.
>

makes sense, done

> Oleg.
>



Re: [PATCH bpf-next 2/3] uprobes: prepare uprobe args buffer lazily

2024-03-13 Thread Andrii Nakryiko
On Wed, Mar 13, 2024 at 8:48 AM Oleg Nesterov  wrote:
>
> Again, looks good to me, but I have a minor nit. Feel free to ignore.
>
> On 03/12, Andrii Nakryiko wrote:
> >
> >  static void __uprobe_trace_func(struct trace_uprobe *tu,
> >   unsigned long func, struct pt_regs *regs,
> > - struct uprobe_cpu_buffer *ucb,
> > + struct uprobe_cpu_buffer **ucbp,
> >   struct trace_event_file *trace_file)
> >  {
> >   struct uprobe_trace_entry_head *entry;
> >   struct trace_event_buffer fbuffer;
> > + struct uprobe_cpu_buffer *ucb;
> >   void *data;
> >   int size, esize;
> >   struct trace_event_call *call = trace_probe_event_call(&tu->tp);
> >
> > + ucb = *ucbp;
> > + if (!ucb) {
> > + ucb = prepare_uprobe_buffer(tu, regs);
> > + *ucbp = ucb;
> > + }
>
> perhaps it would be more clean to pass ucbp to prepare_uprobe_buffer()
> and change it to do
>
> if (*ucbp)
> return *ucbp;
>
> at the start. Then __uprobe_trace_func() and __uprobe_perf_func() can
> simply do
>
> ucb = prepare_uprobe_buffer(tu, regs, ucbp);

ok, will do

>
> > - uprobe_buffer_put(ucb);
> > + if (ucb)
> > + uprobe_buffer_put(ucb);
>
> Similarly, I think the "ucb != NULL" check should be shifted into
> uprobe_buffer_put().

sure, will hide it inside uprobe_buffer_put()

>
> Oleg.
>



Re: [PATCH bpf-next 3/3] uprobes: add speculative lockless system-wide uprobe filter check

2024-03-13 Thread Andrii Nakryiko
On Wed, Mar 13, 2024 at 6:20 AM Oleg Nesterov  wrote:
>
> I forgot everything about this code, plus it has changed a lot since
> I looked at it many years ago, but ...
>
> I think this change is fine but the changelog looks a bit confusing
> (overcomplicated) to me.

It's a new piece of code and logic, so I tried to do my due diligence
and argue why I think it's fine. I'll drop the overcomplicated
explanation, as I agree with you that it's inherently racy even
without my changes (and use-after-free safety is provided by
uprobe->register_rwsem independently of all this).

>
> On 03/12, Andrii Nakryiko wrote:
> >
> > This patch adds a speculative check before grabbing that rwlock. If
> > nr_systemwide is non-zero, lock is skipped and event is passed through.
> > From examining existing logic it looks correct and safe to do. If
> > nr_systemwide is being modified under rwlock in parallel, we have to
> > consider basically just one important race condition: the case when
> > nr_systemwide is dropped from one to zero (from
> > trace_uprobe_filter_remove()) under filter->rwlock, but
> > uprobe_perf_filter() raced and saw it as >0.
>
> Unless I am totally confused, there is nothing new. Even without
> this change trace_uprobe_filter_remove() can clear nr_systemwide
> right after uprobe_perf_filter() drops filter->rwlock.
>
> And of course, trace_uprobe_filter_add() can change nr_systemwide
> from 0 to 1. In this case uprobe_perf_func() can "wrongly" return
> UPROBE_HANDLER_REMOVE but we can't avoid this and afaics this is
> fine even if handler_chain() does unapply_uprobe(), uprobe_perf_open()
> will do uprobe_apply() after that, we can rely on ->register_rwsem.
>

yep, agreed

> > In case we speculatively read nr_systemwide as zero, while it was
> > incremented in parallel, we'll proceed to grabbing filter->rwlock and
> > re-doing the check, this time in lock-protected and non-racy way.
>
> See above...
>
>
> So I think uprobe_perf_filter() needs filter->rwlock only to iterate
> the list, it can check nr_systemwide lockless and this means that you
> can also remove the same check in __uprobe_perf_filter(), other callers
> trace_uprobe_filter_add/remove check it themselves.
>

makes sense, will do

>
> > --- a/kernel/trace/trace_uprobe.c
> > +++ b/kernel/trace/trace_uprobe.c
> > @@ -1351,6 +1351,10 @@ static bool uprobe_perf_filter(struct 
> > uprobe_consumer *uc,
> >   tu = container_of(uc, struct trace_uprobe, consumer);
> >   filter = tu->tp.event->filter;
> >
> > + /* speculative check */
> > + if (READ_ONCE(filter->nr_systemwide))
> > + return true;
> > +
> >   read_lock(&filter->rwlock);
> >   ret = __uprobe_perf_filter(filter, mm);
> >   read_unlock(&filter->rwlock);
>
> ACK,
>
> but see above. I think the changelog should be simplified and the
> filter->nr_systemwide check in __uprobe_perf_filter() should be
> removed. But I won't insist and perhaps I missed something...
>

I think you are right, I'll move the check

> Oleg.
>



Re: [PATCH bpf-next 0/3] uprobes: two common case speed ups

2024-03-13 Thread Andrii Nakryiko
On Wed, Mar 13, 2024 at 2:41 AM Jiri Olsa  wrote:
>
> On Tue, Mar 12, 2024 at 02:02:30PM -0700, Andrii Nakryiko wrote:
> > This patch set implements two speed ups for uprobe/uretprobe runtime 
> > execution
> > path for some common scenarios: BPF-only uprobes (patches #1 and #2) and
> > system-wide (non-PID-specific) uprobes (patch #3). Please see individual
> > patches for details.
> >
> > Given I haven't worked with uprobe code before, I'm unfamiliar with
> > conventions in this subsystem, including which kernel tree patches should be
> > sent to. For now I based all the changes on top of bpf-next/master, which is
> > where I tested and benchmarked everything anyways. Please advise what should
> > I use as a base for subsequent revision. Thanks.

Steven, Masami,

Are these the kind of patches that should go through your tree(s)? Or
would you be fine with this going through bpf-next? I'd appreciate the
link to the specific GIT repo I should use as a base in the former
case, thank you!

> >
> > Andrii Nakryiko (3):
> >   uprobes: encapsulate preparation of uprobe args buffer
> >   uprobes: prepare uprobe args buffer lazily
> >   uprobes: add speculative lockless system-wide uprobe filter check
>
> nice cleanup and speed up, lgtm
>
> Reviewed-by: Jiri Olsa 
>
> jirka
>
> >
> >  kernel/trace/trace_uprobe.c | 103 ++--
> >  1 file changed, 63 insertions(+), 40 deletions(-)
> >
> > --
> > 2.43.0
> >
> >



[PATCH v2 0/3] uprobes: two common case speed ups

2024-03-18 Thread Andrii Nakryiko
This patch set implements two speed ups for uprobe/uretprobe runtime execution
path for some common scenarios: BPF-only uprobes (patches #1 and #2) and
system-wide (non-PID-specific) uprobes (patch #3). Please see individual
patches for details.

v1->v2:
  - rebased onto trace/core branch of tracing tree, hopefully I guessed right;
  - simplified uprobe_cpu_buffer usage further (Oleg Nesterov);
  - simplified patch #3, just moved speculative check outside of lock (Oleg);
  - added Reviewed-by from Jiri Olsa.

Andrii Nakryiko (3):
  uprobes: encapsulate preparation of uprobe args buffer
  uprobes: prepare uprobe args buffer lazily
  uprobes: add speculative lockless system-wide uprobe filter check

 kernel/trace/trace_uprobe.c | 103 +---
 1 file changed, 59 insertions(+), 44 deletions(-)

-- 
2.43.0




[PATCH v2 1/3] uprobes: encapsulate preparation of uprobe args buffer

2024-03-18 Thread Andrii Nakryiko
Move the logic of fetching the temporary per-CPU uprobe buffer and storing
uprobe args into it to a new helper function. Store the data size as part
of this buffer, simplifying interfaces a bit, as now we only pass a single
uprobe_cpu_buffer reference around, instead of pointer + dsize.

This logic was duplicated across uprobe_dispatcher and uretprobe_dispatcher,
and now will be centralized. All this is also in preparation to make
this uprobe_cpu_buffer handling logic optional in the next patch.

Reviewed-by: Jiri Olsa 
Signed-off-by: Andrii Nakryiko 
---
 kernel/trace/trace_uprobe.c | 78 +++--
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a84b85d8aac1..9bffaab448a6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = {
 struct uprobe_cpu_buffer {
struct mutex mutex;
void *buf;
+   int dsize;
 };
 static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
 static int uprobe_buffer_refcnt;
@@ -943,9 +944,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer 
*ucb)
mutex_unlock(&ucb->mutex);
 }
 
+static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
+  struct pt_regs *regs)
+{
+   struct uprobe_cpu_buffer *ucb;
+   int dsize, esize;
+
+   esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+   dsize = __get_data_size(&tu->tp, regs);
+
+   ucb = uprobe_buffer_get();
+   ucb->dsize = tu->tp.size + dsize;
+
+   store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+
+   return ucb;
+}
+
 static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
-   struct uprobe_cpu_buffer *ucb, int dsize,
+   struct uprobe_cpu_buffer *ucb,
struct trace_event_file *trace_file)
 {
struct uprobe_trace_entry_head *entry;
@@ -956,14 +974,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
WARN_ON(call != trace_file->event_call);
 
-   if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
+   if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE))
return;
 
if (trace_trigger_soft_disabled(trace_file))
return;
 
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-   size = esize + tu->tp.size + dsize;
+   size = esize + ucb->dsize;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
@@ -977,14 +995,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
 
-   memcpy(data, ucb->buf, tu->tp.size + dsize);
+   memcpy(data, ucb->buf, ucb->dsize);
 
trace_event_buffer_commit(&fbuffer);
 }
 
 /* uprobe handler */
 static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
-struct uprobe_cpu_buffer *ucb, int dsize)
+struct uprobe_cpu_buffer *ucb)
 {
struct event_file_link *link;
 
@@ -993,7 +1011,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 
struct pt_regs *regs,
 
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
-   __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+   __uprobe_trace_func(tu, 0, regs, ucb, link->file);
rcu_read_unlock();
 
return 0;
@@ -1001,13 +1019,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 
struct pt_regs *regs,
 
 static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
 struct pt_regs *regs,
-struct uprobe_cpu_buffer *ucb, int dsize)
+struct uprobe_cpu_buffer *ucb)
 {
struct event_file_link *link;
 
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
-   __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+   __uprobe_trace_func(tu, func, regs, ucb, link->file);
rcu_read_unlock();
 }
 
@@ -1335,7 +1353,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
 
 static void __uprobe_perf_func(struct trace_uprobe *tu,
   unsigned long func, struct pt_regs *regs,
-  struct uprobe_cpu_buffer *ucb, int dsize)
+  struct uprobe_cpu_buffer *ucb)
 {
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
@@ -1356,7 +1374,7 @@ static void __uprobe_perf_func(struct trace_

[PATCH v2 2/3] uprobes: prepare uprobe args buffer lazily

2024-03-18 Thread Andrii Nakryiko
uprobe_cpu_buffer and corresponding logic to store uprobe args into it
are used for uprobes/uretprobes that are created through tracefs or
perf events.

BPF is yet another user of uprobe/uretprobe infrastructure, but doesn't
need uprobe_cpu_buffer and associated data. For BPF-only use cases this
buffer handling and preparation is a pure overhead. At the same time,
BPF-only uprobe/uretprobe usage is very common in practice. Also, in
a lot of cases applications are very sensitive to performance overheads,
as they might be tracing very high frequency functions like
malloc()/free(), so every bit of performance improvement matters.

All that is to say that this uprobe_cpu_buffer preparation is an
unnecessary overhead that each BPF user of uprobes/uretprobes has to pay.
This patch changes this by making uprobe_cpu_buffer preparation
optional. It will happen only if either a tracefs-based or a perf event-based
uprobe/uretprobe consumer is registered for a given uprobe/uretprobe. For
BPF-only use cases this step will be skipped.

We used the uprobe/uretprobe benchmark which is part of BPF selftests (see [0])
to estimate the improvements. We have 3 uprobe and 3 uretprobe
scenarios, which vary the instruction that is replaced by the uprobe: nop
(fastest uprobe case), `push rbp` (typical case), and non-simulated
`ret` instruction (slowest case). The benchmark thread constantly calls a
user space function in a tight loop. The user space function has an attached
BPF uprobe or uretprobe program doing nothing but atomic counter
increments to count the number of triggering calls. The benchmark emits
throughput in millions of executions per second.

BEFORE these changes

uprobe-nop :2.657 ± 0.024M/s
uprobe-push:2.499 ± 0.018M/s
uprobe-ret :1.100 ± 0.006M/s
uretprobe-nop  :1.356 ± 0.004M/s
uretprobe-push :1.317 ± 0.019M/s
uretprobe-ret  :0.785 ± 0.007M/s

AFTER these changes
===
uprobe-nop :2.732 ± 0.022M/s (+2.8%)
uprobe-push:2.621 ± 0.016M/s (+4.9%)
uprobe-ret :1.105 ± 0.007M/s (+0.5%)
uretprobe-nop  :1.396 ± 0.007M/s (+2.9%)
uretprobe-push :1.347 ± 0.008M/s (+2.3%)
uretprobe-ret  :0.800 ± 0.006M/s (+1.9%)

So the improvements on this particular machine seem to be between 2% and 5%.

  [0] 
https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c

Reviewed-by: Jiri Olsa 
Signed-off-by: Andrii Nakryiko 
---
 kernel/trace/trace_uprobe.c | 49 +
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9bffaab448a6..b5da95240a31 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -941,15 +941,21 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
 
 static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
 {
+   if (!ucb)
+   return;
mutex_unlock(&ucb->mutex);
 }
 
 static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
-  struct pt_regs *regs)
+  struct pt_regs *regs,
+  struct uprobe_cpu_buffer 
**ucbp)
 {
struct uprobe_cpu_buffer *ucb;
int dsize, esize;
 
+   if (*ucbp)
+   return *ucbp;
+
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
dsize = __get_data_size(&tu->tp, regs);
 
@@ -958,22 +964,25 @@ static struct uprobe_cpu_buffer 
*prepare_uprobe_buffer(struct trace_uprobe *tu,
 
store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
 
+   *ucbp = ucb;
return ucb;
 }
 
 static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
-   struct uprobe_cpu_buffer *ucb,
+   struct uprobe_cpu_buffer **ucbp,
struct trace_event_file *trace_file)
 {
struct uprobe_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
+   struct uprobe_cpu_buffer *ucb;
void *data;
int size, esize;
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
 
WARN_ON(call != trace_file->event_call);
 
+   ucb = prepare_uprobe_buffer(tu, regs, ucbp);
if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE))
return;
 
@@ -1002,7 +1011,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
 /* uprobe handler */
 static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
-struct uprobe_cpu_buffer *ucb)
+struct uprobe_cpu_buffer **ucbp)
 {
struct event_file_link *link;
 
@@ -1011,7 +1020,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 

[PATCH v2 3/3] uprobes: add speculative lockless system-wide uprobe filter check

2024-03-18 Thread Andrii Nakryiko
It's very common with BPF-based uprobe/uretprobe use cases to have
system-wide (not PID-specific) probes in use. In this case the uprobe's
trace_uprobe_filter->nr_systemwide counter is bumped at registration
time, and actual filtering is short-circuited at the time the
uprobe/uretprobe is triggered.

This is a great optimization, and the only issue with it is that, to even
get to checking this counter, the uprobe subsystem takes the
read side of trace_uprobe_filter->rwlock. This is actually noticeable in
profiles and is just another point of contention when a uprobe is
triggered on multiple CPUs simultaneously.

This patch moves this nr_systemwide check outside of the filter list's
rwlock scope, as the rwlock is meant to protect list modification, while
the nr_systemwide-based check is speculative and racy already, despite the
lock (as discussed in [0]). trace_uprobe_filter_remove() and
trace_uprobe_filter_add() already check filter->nr_systemwide
explicitly outside of __uprobe_perf_filter(), so no modifications are
required there.

Confirming the improvements with BPF selftests-based benchmarks.

BEFORE (based on changes in previous patch)
===
uprobe-nop :2.732 ± 0.022M/s
uprobe-push:2.621 ± 0.016M/s
uprobe-ret :1.105 ± 0.007M/s
uretprobe-nop  :1.396 ± 0.007M/s
uretprobe-push :1.347 ± 0.008M/s
uretprobe-ret  :0.800 ± 0.006M/s

AFTER
=
uprobe-nop :2.878 ± 0.017M/s (+5.5%, total +8.3%)
uprobe-push:2.753 ± 0.013M/s (+5.3%, total +10.2%)
uprobe-ret :1.142 ± 0.010M/s (+3.8%, total +3.8%)
uretprobe-nop  :1.444 ± 0.008M/s (+3.5%, total +6.5%)
uretprobe-push :1.410 ± 0.010M/s (+4.8%, total +7.1%)
uretprobe-ret  :0.816 ± 0.002M/s (+2.0%, total +3.9%)

In the above, the first percentage value is relative to the previous patch
(lazy uprobe buffer optimization), while the "total" percentage is
relative to the kernel without any of the changes in this patch set.

As can be seen, we get about 4% - 10% speed up, in total, with both lazy
uprobe buffer and speculative filter check optimizations.

  [0] https://lore.kernel.org/bpf/20240313131926.ga19...@redhat.com/

Reviewed-by: Jiri Olsa 
Signed-off-by: Andrii Nakryiko 
---
 kernel/trace/trace_uprobe.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b5da95240a31..ac05885a6ce6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1226,9 +1226,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, 
struct mm_struct *mm)
 {
struct perf_event *event;
 
-   if (filter->nr_systemwide)
-   return true;
-
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
if (event->hw.target->mm == mm)
return true;
@@ -1353,6 +1350,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer 
*uc,
tu = container_of(uc, struct trace_uprobe, consumer);
filter = tu->tp.event->filter;
 
+   /*
+* speculative short-circuiting check to avoid unnecessarily taking
+* filter->rwlock below, if the uprobe has system-wide consumer
+*/
+   if (READ_ONCE(filter->nr_systemwide))
+   return true;
+
read_lock(&filter->rwlock);
ret = __uprobe_perf_filter(filter, mm);
read_unlock(&filter->rwlock);
-- 
2.43.0




Re: [PATCH v2 0/3] uprobes: two common case speed ups

2024-03-19 Thread Andrii Nakryiko
On Mon, Mar 18, 2024 at 9:21 PM Masami Hiramatsu  wrote:
>
> Hi,
>
> On Mon, 18 Mar 2024 11:17:25 -0700
> Andrii Nakryiko  wrote:
>
> > This patch set implements two speed ups for uprobe/uretprobe runtime 
> > execution
> > path for some common scenarios: BPF-only uprobes (patches #1 and #2) and
> > system-wide (non-PID-specific) uprobes (patch #3). Please see individual
> > patches for details.
>
> This series looks good to me. Let me pick it on probes/for-next.

Great, at least I guessed the Git repo right, if not the branch.
Thanks for pulling it in! I assume some other uprobe-related follow up
patches should be based on probes/for-next as well, right?

>
> Thanks!
>
> >
> > v1->v2:
> >   - rebased onto trace/core branch of tracing tree, hopefully I guessed 
> > right;
> >   - simplified user_cpu_buffer usage further (Oleg Nesterov);
> >   - simplified patch #3, just moved speculative check outside of lock 
> > (Oleg);
> >   - added Reviewed-by from Jiri Olsa.
> >
> > Andrii Nakryiko (3):
> >   uprobes: encapsulate preparation of uprobe args buffer
> >   uprobes: prepare uprobe args buffer lazily
> >   uprobes: add speculative lockless system-wide uprobe filter check
> >
> >  kernel/trace/trace_uprobe.c | 103 +---
> >  1 file changed, 59 insertions(+), 44 deletions(-)
> >
> > --
> > 2.43.0
> >
>
>
> --
> Masami Hiramatsu (Google) 



Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-03-21 Thread Andrii Nakryiko
On Thu, Mar 21, 2024 at 7:57 AM Jonathan Haslam
 wrote:
>
> Active uprobes are stored in an RB tree and accesses to this tree are
> dominated by read operations. Currently these accesses are serialized by
> a spinlock but this leads to enormous contention when large numbers of
> threads are executing active probes.
>
> This patch converts the spinlock used to serialize access to the
> uprobes_tree RB tree into a reader-writer spinlock. This lock type
> aligns naturally with the overwhelmingly read-only nature of the tree
> usage here. Although the addition of reader-writer spinlocks are
> discouraged [0], this fix is proposed as an interim solution while an
> RCU based approach is implemented (that work is in a nascent form). This
> fix also has the benefit of being trivial, self contained and therefore
> simple to backport.

Yep, makes sense, I think we'll want to backport this ASAP to some of
the old kernels we have. Thanks!

Acked-by: Andrii Nakryiko 

>
> This change has been tested against production workloads that exhibit
> significant contention on the spinlock, and an almost order-of-magnitude
> reduction in mean uprobe execution time is observed (28 -> 3.5 microsecs).
>
> [0] https://docs.kernel.org/locking/spinlocks.html
>
> Signed-off-by: Jonathan Haslam 
> ---
>  kernel/events/uprobes.c | 22 +++---
>  1 file changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 929e98c62965..42bf9b6e8bc0 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
>   */
>  #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
>
> -static DEFINE_SPINLOCK(uprobes_treelock);  /* serialize rbtree access */
> +static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */
>
>  #define UPROBES_HASH_SZ13
>  /* serialize uprobe->pending_list */
> @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, 
> loff_t offset)
>  {
> struct uprobe *uprobe;
>
> -   spin_lock(&uprobes_treelock);
> +   read_lock(&uprobes_treelock);
> uprobe = __find_uprobe(inode, offset);
> -   spin_unlock(&uprobes_treelock);
> +   read_unlock(&uprobes_treelock);
>
> return uprobe;
>  }
> @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
>  {
> struct uprobe *u;
>
> -   spin_lock(&uprobes_treelock);
> +   write_lock(&uprobes_treelock);
> u = __insert_uprobe(uprobe);
> -   spin_unlock(&uprobes_treelock);
> +   write_unlock(&uprobes_treelock);
>
> return u;
>  }
> @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe)
> if (WARN_ON(!uprobe_is_active(uprobe)))
> return;
>
> -   spin_lock(&uprobes_treelock);
> +   write_lock(&uprobes_treelock);
> rb_erase(&uprobe->rb_node, &uprobes_tree);
> -   spin_unlock(&uprobes_treelock);
> +   write_unlock(&uprobes_treelock);
> RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
> put_uprobe(uprobe);
>  }
> @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode,
> min = vaddr_to_offset(vma, start);
> max = min + (end - start) - 1;
>
> -   spin_lock(&uprobes_treelock);
> +   read_lock(&uprobes_treelock);
> n = find_node_in_range(inode, min, max);
> if (n) {
> for (t = n; t; t = rb_prev(t)) {
> @@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode,
> get_uprobe(u);
> }
> }
> -   spin_unlock(&uprobes_treelock);
> +   read_unlock(&uprobes_treelock);
>  }
>
>  /* @vma contains reference counter, not the probed instruction. */
> @@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned 
> long start, unsigned long e
> min = vaddr_to_offset(vma, start);
> max = min + (end - start) - 1;
>
> -   spin_lock(&uprobes_treelock);
> +   read_lock(&uprobes_treelock);
> n = find_node_in_range(inode, min, max);
> -   spin_unlock(&uprobes_treelock);
> +   read_unlock(&uprobes_treelock);
>
> return !!n;
>  }
> --
> 2.43.0
>



Re: raw_tp+cookie is buggy. Was: [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run1

2024-03-25 Thread Andrii Nakryiko
On Sun, Mar 24, 2024 at 5:07 PM Alexei Starovoitov
 wrote:
>
> Hi Andrii,
>
> syzbot found UAF in raw_tp cookie series in bpf-next.
> Reverting the whole merge
> 2e244a72cd48 ("Merge branch 'bpf-raw-tracepoint-support-for-bpf-cookie'")
>
> fixes the issue.
>
> Pls take a look.
> See C reproducer below. It splats consistently with CONFIG_KASAN=y
>
> Thanks.

Will do, traveling today, so will be offline for a bit, but will check
first thing afterwards.

>
> On Sun, Mar 24, 2024 at 4:28 PM syzbot
>  wrote:
> >
> > Hello,
> >
> > syzbot found the following issue on:
> >
> > HEAD commit:520fad2e3206 selftests/bpf: scale benchmark counting by us..
> > git tree:   bpf-next
> > console+strace: https://syzkaller.appspot.com/x/log.txt?x=105af94618
> > kernel config:  https://syzkaller.appspot.com/x/.config?x=6fb1be60a193d440
> > dashboard link: https://syzkaller.appspot.com/bug?extid=981935d9485a560bfbcb
> > compiler:   Debian clang version 15.0.6, GNU ld (GNU Binutils for 
> > Debian) 2.40
> > syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=114f17a518
> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=162bb7a518
> >
> > Downloadable assets:
> > disk image: 
> > https://storage.googleapis.com/syzbot-assets/4eef3506c5ce/disk-520fad2e.raw.xz
> > vmlinux: 
> > https://storage.googleapis.com/syzbot-assets/24d60ebe76cc/vmlinux-520fad2e.xz
> > kernel image: 
> > https://storage.googleapis.com/syzbot-assets/8f883e706550/bzImage-520fad2e.xz
> >
> > IMPORTANT: if you fix the issue, please add the following tag to the commit:
> > Reported-by: syzbot+981935d9485a560bf...@syzkaller.appspotmail.com
> >
> > ==
> > BUG: KASAN: slab-use-after-free in __bpf_trace_run 
> > kernel/trace/bpf_trace.c:2376 [inline]
> > BUG: KASAN: slab-use-after-free in bpf_trace_run1+0xcb/0x510 
> > kernel/trace/bpf_trace.c:2430
> > Read of size 8 at addr 8880290d9918 by task migration/0/19
> >
> > CPU: 0 PID: 19 Comm: migration/0 Not tainted 
> > 6.8.0-syzkaller-05233-g520fad2e3206 #0
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> > Google 02/29/2024
> > Stopper: 0x0 <- 0x0
> > Call Trace:
> >  
> >  __dump_stack lib/dump_stack.c:88 [inline]
> >  dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106
> >  print_address_description mm/kasan/report.c:377 [inline]
> >  print_report+0x169/0x550 mm/kasan/report.c:488
> >  kasan_report+0x143/0x180 mm/kasan/report.c:601
> >  __bpf_trace_run kernel/trace/bpf_trace.c:2376 [inline]
> >  bpf_trace_run1+0xcb/0x510 kernel/trace/bpf_trace.c:2430
> >  __traceiter_rcu_utilization+0x74/0xb0 include/trace/events/rcu.h:27
> >  trace_rcu_utilization+0x194/0x1c0 include/trace/events/rcu.h:27
> >  rcu_note_context_switch+0xc7c/0xff0 kernel/rcu/tree_plugin.h:360
> >  __schedule+0x345/0x4a20 kernel/sched/core.c:6635
> >  __schedule_loop kernel/sched/core.c:6813 [inline]
> >  schedule+0x14b/0x320 kernel/sched/core.c:6828
> >  smpboot_thread_fn+0x61e/0xa30 kernel/smpboot.c:160
> >  kthread+0x2f0/0x390 kernel/kthread.c:388
> >  ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
> >  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243
> >  
> >
> > Allocated by task 5075:
> >  kasan_save_stack mm/kasan/common.c:47 [inline]
> >  kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
> >  poison_kmalloc_redzone mm/kasan/common.c:370 [inline]
> >  __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:387
> >  kasan_kmalloc include/linux/kasan.h:211 [inline]
> >  kmalloc_trace+0x1d9/0x360 mm/slub.c:4012
> >  kmalloc include/linux/slab.h:590 [inline]
> >  kzalloc include/linux/slab.h:711 [inline]
> >  bpf_raw_tp_link_attach+0x2a0/0x6e0 kernel/bpf/syscall.c:3816
> >  bpf_raw_tracepoint_open+0x1c2/0x240 kernel/bpf/syscall.c:3863
> >  __sys_bpf+0x3c0/0x810 kernel/bpf/syscall.c:5673
> >  __do_sys_bpf kernel/bpf/syscall.c:5738 [inline]
> >  __se_sys_bpf kernel/bpf/syscall.c:5736 [inline]
> >  __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5736
> >  do_syscall_64+0xfb/0x240
> >  entry_SYSCALL_64_after_hwframe+0x6d/0x75
> >
> > Freed by task 5075:
> >  kasan_save_stack mm/kasan/common.c:47 [inline]
> >  kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
> >  kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:589
> >  poison_slab_object+0xa6/0xe0 mm/kasan/common.c:240
> >  __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256
> >  kasan_slab_free include/linux/kasan.h:184 [inline]
> >  slab_free_hook mm/slub.c:2121 [inline]
> >  slab_free mm/slub.c:4299 [inline]
> >  kfree+0x14a/0x380 mm/slub.c:4409
> >  bpf_link_release+0x3b/0x50 kernel/bpf/syscall.c:3071
> >  __fput+0x429/0x8a0 fs/file_table.c:423
> >  task_work_run+0x24f/0x310 kernel/task_work.c:180
> >  exit_task_work include/linux/task_work.h:38 [inline]
> >  do_exit+0xa1b/0x27e0 kernel/exit.c:878
> >  do_group_exit+0x207/0x2c0 kernel/exit.c:1027
> >  __do_sys_exit_group kernel/exit.c:1038 [inline]
> >  __se_sys_exit_group kernel/exit.c:1036 [inline]
> >  __x64_sys_ex

Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-03-25 Thread Andrii Nakryiko
On Mon, Mar 25, 2024 at 12:12 PM Jonthan Haslam
 wrote:
>
> Hi Ingo,
>
> > > This change has been tested against production workloads that exhibit
> > > significant contention on the spinlock and an almost order of magnitude
> > > reduction for mean uprobe execution time is observed (28 -> 3.5 
> > > microsecs).
> >
> > Have you considered/measured per-CPU RW semaphores?
>
> No I hadn't but thanks hugely for suggesting it! In initial measurements
> it seems to be between 20-100% faster than the RW spinlocks! Apologies for
> all the exclamation marks but I'm very excited. I'll do some more testing
> tomorrow but so far it's looking very good.
>

Documentation ([0]) says that locking for writing calls
synchronize_rcu(), is that right? If that's true, attaching multiple
uprobes (including just attaching a single BPF multi-uprobe) will take
a really long time. We need to confirm we are not significantly
regressing this. And if we do, we need to take measures in the BPF
multi-uprobe attachment code path to make sure that a single
multi-uprobe attachment is still fast.

If my worries above turn out to be true, it still feels like a first
good step should be landing this patch as is (and getting it backported to
older kernels), and then having percpu rw-semaphore as a final (and a
bit more invasive) solution (it's RCU-based, so it feels like a good
primitive to settle on), making sure not to regress multi-uprobes
(we'll probably need some batched API for multiple uprobes).

Thoughts?

  [0] https://docs.kernel.org/locking/percpu-rw-semaphore.html
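
For reference, a minimal sketch of what the read path could look like with
a per-CPU rw-semaphore (the lock name is hypothetical; this is not code from
this thread, just an illustration of the primitive being discussed):

#include <linux/percpu-rwsem.h>

DEFINE_STATIC_PERCPU_RWSEM(uprobes_treesem);    /* would replace uprobes_treelock */

static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe *uprobe;

        /* readers don't write any shared cache line on the fast path */
        percpu_down_read(&uprobes_treesem);
        uprobe = __find_uprobe(inode, offset);
        percpu_up_read(&uprobes_treesem);

        return uprobe;
}

The catch, as noted above, is the write side: percpu_down_write() is much
more expensive (it can involve an RCU grace period), which is exactly why
attaching thousands of uprobes one at a time is a concern.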

> Thanks again for the input.
>
> Jon.



Re: raw_tp+cookie is buggy. Was: [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run1

2024-03-25 Thread Andrii Nakryiko
On Mon, Mar 25, 2024 at 10:27 AM Andrii Nakryiko
 wrote:
>
> On Sun, Mar 24, 2024 at 5:07 PM Alexei Starovoitov
>  wrote:
> >
> > Hi Andrii,
> >
> > syzbot found UAF in raw_tp cookie series in bpf-next.
> > Reverting the whole merge
> > 2e244a72cd48 ("Merge branch 'bpf-raw-tracepoint-support-for-bpf-cookie'")
> >
> > fixes the issue.
> >
> > Pls take a look.
> > See C reproducer below. It splats consistently with CONFIG_KASAN=y
> >
> > Thanks.
>
> Will do, traveling today, so will be offline for a bit, but will check
> first thing afterwards.
>

Ok, so I don't think it's bpf_raw_tp_link specific, it should affect a
bunch of other links (unless I missed something). Basically, when the last
link refcnt drops, we detach, do bpf_prog_put() and then proceed to
kfree the link itself synchronously. But that link can still be referenced
from a running BPF program (I think multi-kprobe/multi-uprobe use it for
cookies, raw_tp with my changes started using the link at runtime, there
are probably more types), and so if we free this memory synchronously,
we can have a UAF.

We should do what we do for bpf_maps and delay freeing; the only
question is how tunable that freeing should be. Always do call_rcu()?
Always call_rcu_tasks_trace() (relevant for sleepable multi-uprobes)?
Should we allow a synchronous free if the link is not directly accessible
from the program during its run?
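
A minimal sketch of the call_rcu()-based deferral being discussed (struct
and function names are illustrative; this is not the actual RFC fix):

#include <linux/container_of.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_link {
        struct rcu_head rcu;
        /* ... rest of the link state ... */
};

static void my_link_free_rcu(struct rcu_head *rcu)
{
        struct my_link *link = container_of(rcu, struct my_link, rcu);

        kfree(link);
}

static void my_link_release(struct my_link *link)
{
        /* defer the actual free past an RCU grace period so that a program
         * still running under RCU protection cannot observe freed memory */
        call_rcu(&link->rcu, my_link_free_rcu);
}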

Anyway, I sent a fix as an RFC so we can discuss.

> >
> > On Sun, Mar 24, 2024 at 4:28 PM syzbot
> >  wrote:
> > >
> > > Hello,
> > >
> > > syzbot found the following issue on:
> > >
> > > HEAD commit:520fad2e3206 selftests/bpf: scale benchmark counting by 
> > > us..
> > > git tree:   bpf-next
> > > console+strace: https://syzkaller.appspot.com/x/log.txt?x=105af94618
> > > kernel config:  https://syzkaller.appspot.com/x/.config?x=6fb1be60a193d440
> > > dashboard link: 
> > > https://syzkaller.appspot.com/bug?extid=981935d9485a560bfbcb
> > > compiler:   Debian clang version 15.0.6, GNU ld (GNU Binutils for 
> > > Debian) 2.40
> > > syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=114f17a518
> > > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=162bb7a518
> > >
> > > Downloadable assets:
> > > disk image: 
> > > https://storage.googleapis.com/syzbot-assets/4eef3506c5ce/disk-520fad2e.raw.xz
> > > vmlinux: 
> > > https://storage.googleapis.com/syzbot-assets/24d60ebe76cc/vmlinux-520fad2e.xz
> > > kernel image: 
> > > https://storage.googleapis.com/syzbot-assets/8f883e706550/bzImage-520fad2e.xz
> > >
> > > IMPORTANT: if you fix the issue, please add the following tag to the 
> > > commit:
> > > Reported-by: syzbot+981935d9485a560bf...@syzkaller.appspotmail.com
> > >
> > > ==
> > > BUG: KASAN: slab-use-after-free in __bpf_trace_run 
> > > kernel/trace/bpf_trace.c:2376 [inline]
> > > BUG: KASAN: slab-use-after-free in bpf_trace_run1+0xcb/0x510 
> > > kernel/trace/bpf_trace.c:2430
> > > Read of size 8 at addr 8880290d9918 by task migration/0/19
> > >
> > > CPU: 0 PID: 19 Comm: migration/0 Not tainted 
> > > 6.8.0-syzkaller-05233-g520fad2e3206 #0
> > > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> > > Google 02/29/2024
> > > Stopper: 0x0 <- 0x0
> > > Call Trace:
> > >  
> > >  __dump_stack lib/dump_stack.c:88 [inline]
> > >  dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106
> > >  print_address_description mm/kasan/report.c:377 [inline]
> > >  print_report+0x169/0x550 mm/kasan/report.c:488
> > >  kasan_report+0x143/0x180 mm/kasan/report.c:601
> > >  __bpf_trace_run kernel/trace/bpf_trace.c:2376 [inline]
> > >  bpf_trace_run1+0xcb/0x510 kernel/trace/bpf_trace.c:2430
> > >  __traceiter_rcu_utilization+0x74/0xb0 include/trace/events/rcu.h:27
> > >  trace_rcu_utilization+0x194/0x1c0 include/trace/events/rcu.h:27
> > >  rcu_note_context_switch+0xc7c/0xff0 kernel/rcu/tree_plugin.h:360
> > >  __schedule+0x345/0x4a20 kernel/sched/core.c:6635
> > >  __schedule_loop kernel/sched/core.c:6813 [inline]
> > >  schedule+0x14b/0x320 kernel/sched/core.c:6828
> > >  smpboot_thread_fn+0x61e/0xa30 kernel/smpboot.c:160
> > >  kthread+0x2f0/0x390 kernel/kthread.c:388
> > >  ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
> > >  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_6

Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-03-26 Thread Andrii Nakryiko
On Sun, Mar 24, 2024 at 8:03 PM Masami Hiramatsu  wrote:
>
> On Thu, 21 Mar 2024 07:57:35 -0700
> Jonathan Haslam  wrote:
>
> > Active uprobes are stored in an RB tree and accesses to this tree are
> > dominated by read operations. Currently these accesses are serialized by
> > a spinlock but this leads to enormous contention when large numbers of
> > threads are executing active probes.
> >
> > This patch converts the spinlock used to serialize access to the
> > uprobes_tree RB tree into a reader-writer spinlock. This lock type
> > aligns naturally with the overwhelmingly read-only nature of the tree
> > usage here. Although the addition of reader-writer spinlocks are
> > discouraged [0], this fix is proposed as an interim solution while an
> > RCU based approach is implemented (that work is in a nascent form). This
> > fix also has the benefit of being trivial, self contained and therefore
> > simple to backport.
> >
> > This change has been tested against production workloads that exhibit
> > significant contention on the spinlock and an almost order of magnitude
> > reduction for mean uprobe execution time is observed (28 -> 3.5 microsecs).
>
> Looks good to me.
>
> Acked-by: Masami Hiramatsu (Google) 

Masami,

Given the discussion around per-cpu rw semaphore and need for
(internal) batched attachment API for uprobes, do you think you can
apply this patch as is for now? We can then gain initial improvements
in scalability that are also easy to backport, and Jonathan will work
on a more complete solution based on per-cpu RW semaphore, as
suggested by Ingo.

>
> BTW, how did you measure the overhead? I think spinlock overhead
> will depend on how much lock contention happens.
>
> Thank you,
>
> >
> > [0] https://docs.kernel.org/locking/spinlocks.html
> >
> > Signed-off-by: Jonathan Haslam 
> > ---
> >  kernel/events/uprobes.c | 22 +++---
> >  1 file changed, 11 insertions(+), 11 deletions(-)
> >
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 929e98c62965..42bf9b6e8bc0 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
> >   */
> >  #define no_uprobe_events()   RB_EMPTY_ROOT(&uprobes_tree)
> >
> > -static DEFINE_SPINLOCK(uprobes_treelock);/* serialize rbtree access */
> > +static DEFINE_RWLOCK(uprobes_treelock);  /* serialize rbtree access */
> >
> >  #define UPROBES_HASH_SZ  13
> >  /* serialize uprobe->pending_list */
> > @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, 
> > loff_t offset)
> >  {
> >   struct uprobe *uprobe;
> >
> > - spin_lock(&uprobes_treelock);
> > + read_lock(&uprobes_treelock);
> >   uprobe = __find_uprobe(inode, offset);
> > - spin_unlock(&uprobes_treelock);
> > + read_unlock(&uprobes_treelock);
> >
> >   return uprobe;
> >  }
> > @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe 
> > *uprobe)
> >  {
> >   struct uprobe *u;
> >
> > - spin_lock(&uprobes_treelock);
> > + write_lock(&uprobes_treelock);
> >   u = __insert_uprobe(uprobe);
> > - spin_unlock(&uprobes_treelock);
> > + write_unlock(&uprobes_treelock);
> >
> >   return u;
> >  }
> > @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe)
> >   if (WARN_ON(!uprobe_is_active(uprobe)))
> >   return;
> >
> > - spin_lock(&uprobes_treelock);
> > + write_lock(&uprobes_treelock);
> >   rb_erase(&uprobe->rb_node, &uprobes_tree);
> > - spin_unlock(&uprobes_treelock);
> > + write_unlock(&uprobes_treelock);
> >   RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
> >   put_uprobe(uprobe);
> >  }
> > @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode,
> >   min = vaddr_to_offset(vma, start);
> >   max = min + (end - start) - 1;
> >
> > - spin_lock(&uprobes_treelock);
> > + read_lock(&uprobes_treelock);
> >   n = find_node_in_range(inode, min, max);
> >   if (n) {
> >   for (t = n; t; t = rb_prev(t)) {
> > @@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode,
> >   get_uprobe(u);
> >   }
> >   }
> > - spin_unlock(&uprobes_treelock);
> > + read_unlock(&uprobes_treelock);
> >  }
> >
> >  /* @vma contains reference counter, not the probed instruction. */
> > @@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned 
> > long start, unsigned long e
> >   min = vaddr_to_offset(vma, start);
> >   max = min + (end - start) - 1;
> >
> > - spin_lock(&uprobes_treelock);
> > + read_lock(&uprobes_treelock);
> >   n = find_node_in_range(inode, min, max);
> > - spin_unlock(&uprobes_treelock);
> > + read_unlock(&uprobes_treelock);
> >
> >   return !!n;
> >  }
> > --
> > 2.43.0
> >
>
>
> --
> Masami Hiramatsu (Google) 



Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-03-27 Thread Andrii Nakryiko
On Wed, Mar 27, 2024 at 5:18 PM Masami Hiramatsu  wrote:
>
> On Wed, 27 Mar 2024 17:06:01 +
> Jonthan Haslam  wrote:
>
> > > > Masami,
> > > >
> > > > Given the discussion around per-cpu rw semaphore and need for
> > > > (internal) batched attachment API for uprobes, do you think you can
> > > > apply this patch as is for now? We can then gain initial improvements
> > > > in scalability that are also easy to backport, and Jonathan will work
> > > > on a more complete solution based on per-cpu RW semaphore, as
> > > > suggested by Ingo.
> > >
> > > Yeah, it is interesting to use per-cpu rw semaphore on uprobe.
> > > I would like to wait for the next version.
> >
> > My initial tests show a nice improvement over the RW spinlocks but a
> > significant regression in acquiring a write lock. I've got a few days
> > vacation over Easter but I'll aim to get some more formalised results out
> > to the thread toward the end of next week.
>
> As far as the write lock is only on the cold path, I think you can choose
> per-cpu RW semaphore. Since it does not do busy wait, the total system
> performance impact will be small.

No, Masami, unfortunately it's not that simple. In BPF we have BPF
multi-uprobe, which can be used to attach to thousands of user
functions. It currently creates one uprobe at a time, as we don't
really have a batched API. If each such uprobe registration now takes
a (relatively) long time, then, multiplied by the number of attach-to
user functions, it will be a horrible regression in terms of
attachment/detachment performance.

So when we switch to a per-CPU rw semaphore, we'll need to provide an
internal batched uprobe attach/detach API to make sure that attaching to
multiple uprobes is still fast.
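
Purely as an illustration (no such API exists at this point; all names here
are hypothetical), a batched registration path could amortize the expensive
write-side acquisition across the whole set:

/* hypothetical: take the tree write lock once per batch instead of once
 * per uprobe; duplicate/error handling omitted for brevity */
static void insert_uprobe_batch(struct uprobe **uprobes, int cnt)
{
        int i;

        percpu_down_write(&uprobes_treesem);    /* hypothetical per-CPU rwsem */
        for (i = 0; i < cnt; i++)
                __insert_uprobe(uprobes[i]);    /* existing rb-tree insert helper */
        percpu_up_write(&uprobes_treesem);
}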

Which is why I was asking to land this patch as is, as it relieves the
scalability pains in production and is easy to backport to old
kernels. And then we can work on batched APIs and switch to per-CPU rw
semaphore.

So I hope you can reconsider and accept the improvements in this patch,
while Jonathan keeps working on an even better final solution.
Thanks!

> I look forward to your formalized results :)
>
> Thank you,
>
> >
> > Jon.
> >
> > >
> > > Thank you,
> > >
> > > >
> > > > >
> > > > > BTW, how did you measure the overhead? I think spinlock overhead
> > > > > will depend on how much lock contention happens.
> > > > >
> > > > > Thank you,
> > > > >
> > > > > >
> > > > > > [0] https://docs.kernel.org/locking/spinlocks.html
> > > > > >
> > > > > > Signed-off-by: Jonathan Haslam 
> > > > > > ---
> > > > > >  kernel/events/uprobes.c | 22 +++---
> > > > > >  1 file changed, 11 insertions(+), 11 deletions(-)
> > > > > >
> > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > index 929e98c62965..42bf9b6e8bc0 100644
> > > > > > --- a/kernel/events/uprobes.c
> > > > > > +++ b/kernel/events/uprobes.c
> > > > > > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
> > > > > >   */
> > > > > >  #define no_uprobe_events()   RB_EMPTY_ROOT(&uprobes_tree)
> > > > > >
> > > > > > -static DEFINE_SPINLOCK(uprobes_treelock);/* serialize rbtree 
> > > > > > access */
> > > > > > +static DEFINE_RWLOCK(uprobes_treelock);  /* serialize rbtree 
> > > > > > access */
> > > > > >
> > > > > >  #define UPROBES_HASH_SZ  13
> > > > > >  /* serialize uprobe->pending_list */
> > > > > > @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode 
> > > > > > *inode, loff_t offset)
> > > > > >  {
> > > > > >   struct uprobe *uprobe;
> > > > > >
> > > > > > - spin_lock(&uprobes_treelock);
> > > > > > + read_lock(&uprobes_treelock);
> > > > > >   uprobe = __find_uprobe(inode, offset);
> > > > > > - spin_unlock(&uprobes_treelock);
> > > > > > + read_unlock(&uprobes_treelock);
> > > > > >
> > > > > >   return uprobe;
> > > > > >  }
> > > > > > @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct 
> > > > > > uprobe *uprobe)
> > > > > >  {
> > > > > >   struct uprobe *u;
> > > > > >
> > > > > > - spin_lock(&uprobes_treelock);
> > > > > > + write_lock(&uprobes_treelock);
> > > > > >   u = __insert_uprobe(uprobe);
> > > > > > - spin_unlock(&uprobes_treelock);
> > > > > > + write_unlock(&uprobes_treelock);
> > > > > >
> > > > > >   return u;
> > > > > >  }
> > > > > > @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe)
> > > > > >   if (WARN_ON(!uprobe_is_active(uprobe)))
> > > > > >   return;
> > > > > >
> > > > > > - spin_lock(&uprobes_treelock);
> > > > > > + write_lock(&uprobes_treelock);
> > > > > >   rb_erase(&uprobe->rb_node, &uprobes_tree);
> > > > > > - spin_unlock(&uprobes_treelock);
> > > > > > + write_unlock(&uprobes_treelock);
> > > > > >   RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
> > > > > >   put_uprobe(uprobe);
> > > > > >  }
> > > > > > @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode 
>

Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-03-29 Thread Andrii Nakryiko
On Wed, Mar 27, 2024 at 5:45 PM Andrii Nakryiko
 wrote:
>
> On Wed, Mar 27, 2024 at 5:18 PM Masami Hiramatsu  wrote:
> >
> > On Wed, 27 Mar 2024 17:06:01 +
> > Jonthan Haslam  wrote:
> >
> > > > > Masami,
> > > > >
> > > > > Given the discussion around per-cpu rw semaphore and need for
> > > > > (internal) batched attachment API for uprobes, do you think you can
> > > > > apply this patch as is for now? We can then gain initial improvements
> > > > > in scalability that are also easy to backport, and Jonathan will work
> > > > > on a more complete solution based on per-cpu RW semaphore, as
> > > > > suggested by Ingo.
> > > >
> > > > Yeah, it is interesting to use per-cpu rw semaphore on uprobe.
> > > > I would like to wait for the next version.
> > >
> > > My initial tests show a nice improvement on the over RW spinlocks but
> > > significant regression in acquiring a write lock. I've got a few days
> > > vacation over Easter but I'll aim to get some more formalised results out
> > > to the thread toward the end of next week.
> >
> > As far as the write lock is only on the cold path, I think you can choose
> > per-cpu RW semaphore. Since it does not do busy wait, the total system
> > performance impact will be small.
>
> No, Masami, unfortunately it's not as simple. In BPF we have BPF
> multi-uprobe, which can be used to attach to thousands of user
> functions. It currently creates one uprobe at a time, as we don't
> really have a batched API. If each such uprobe registration will now
> take a (relatively) long time, when multiplied by number of attach-to
> user functions, it will be a horrible regression in terms of
> attachment/detachment performance.
>
> So when we switch to per-CPU rw semaphore, we'll need to provide an
> internal batch uprobe attach/detach API to make sure that attaching to
> multiple uprobes is still fast.
>
> Which is why I was asking to land this patch as is, as it relieves the
> scalability pains in production and is easy to backport to old
> kernels. And then we can work on batched APIs and switch to per-CPU rw
> semaphore.
>
> So I hope you can reconsider and accept improvements in this patch,
> while Jonathan will keep working on even better final solution.
> Thanks!
>
> > I look forward to your formalized results :)
> >

BTW, as part of BPF selftests, we have a multi-attach test for uprobes
and USDTs, reporting attach/detach timings:
$ sudo ./test_progs -v -t uprobe_multi_test/bench
bpf_testmod.ko is already unloaded.
Loading bpf_testmod.ko...
Successfully loaded bpf_testmod.ko.
test_bench_attach_uprobe:PASS:uprobe_multi_bench__open_and_load 0 nsec
test_bench_attach_uprobe:PASS:uprobe_multi_bench__attach 0 nsec
test_bench_attach_uprobe:PASS:uprobes_count 0 nsec
test_bench_attach_uprobe: attached in   0.120s
test_bench_attach_uprobe: detached in   0.092s
#400/5   uprobe_multi_test/bench_uprobe:OK
test_bench_attach_usdt:PASS:uprobe_multi__open 0 nsec
test_bench_attach_usdt:PASS:bpf_program__attach_usdt 0 nsec
test_bench_attach_usdt:PASS:usdt_count 0 nsec
test_bench_attach_usdt: attached in   0.124s
test_bench_attach_usdt: detached in   0.064s
#400/6   uprobe_multi_test/bench_usdt:OK
#400 uprobe_multi_test:OK
Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED
Successfully unloaded bpf_testmod.ko.

So it should be easy for Jonathan to validate his changes with this.

> > Thank you,
> >
> > >
> > > Jon.
> > >
> > > >
> > > > Thank you,
> > > >
> > > > >
> > > > > >
> > > > > > BTW, how did you measure the overhead? I think spinlock overhead
> > > > > > will depend on how much lock contention happens.
> > > > > >
> > > > > > Thank you,
> > > > > >
> > > > > > >
> > > > > > > [0] https://docs.kernel.org/locking/spinlocks.html
> > > > > > >
> > > > > > > Signed-off-by: Jonathan Haslam 
> > > > > > > ---
> > > > > > >  kernel/events/uprobes.c | 22 +++---
> > > > > > >  1 file changed, 11 insertions(+), 11 deletions(-)
> > > > > > >
> > > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > > index 929e98c62965..42bf9b6e8bc0 100644
> > > > > > > --- a/kernel/events/uprobes.c
> > > > > > > +++ b/kernel/events/uprobes.c
> > > >

Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-03-29 Thread Andrii Nakryiko
On Fri, Mar 29, 2024 at 5:36 PM Masami Hiramatsu  wrote:
>
> On Fri, 29 Mar 2024 10:33:57 -0700
> Andrii Nakryiko  wrote:
>
> > On Wed, Mar 27, 2024 at 5:45 PM Andrii Nakryiko
> >  wrote:
> > >
> > > On Wed, Mar 27, 2024 at 5:18 PM Masami Hiramatsu  
> > > wrote:
> > > >
> > > > On Wed, 27 Mar 2024 17:06:01 +
> > > > Jonthan Haslam  wrote:
> > > >
> > > > > > > Masami,
> > > > > > >
> > > > > > > Given the discussion around per-cpu rw semaphore and need for
> > > > > > > (internal) batched attachment API for uprobes, do you think you 
> > > > > > > can
> > > > > > > apply this patch as is for now? We can then gain initial 
> > > > > > > improvements
> > > > > > > in scalability that are also easy to backport, and Jonathan will 
> > > > > > > work
> > > > > > > on a more complete solution based on per-cpu RW semaphore, as
> > > > > > > suggested by Ingo.
> > > > > >
> > > > > > Yeah, it is interesting to use per-cpu rw semaphore on uprobe.
> > > > > > I would like to wait for the next version.
> > > > >
> > > > > My initial tests show a nice improvement on the over RW spinlocks but
> > > > > significant regression in acquiring a write lock. I've got a few days
> > > > > vacation over Easter but I'll aim to get some more formalised results 
> > > > > out
> > > > > to the thread toward the end of next week.
> > > >
> > > > As far as the write lock is only on the cold path, I think you can 
> > > > choose
> > > > per-cpu RW semaphore. Since it does not do busy wait, the total system
> > > > performance impact will be small.
> > >
> > > No, Masami, unfortunately it's not as simple. In BPF we have BPF
> > > multi-uprobe, which can be used to attach to thousands of user
> > > functions. It currently creates one uprobe at a time, as we don't
> > > really have a batched API. If each such uprobe registration will now
> > > take a (relatively) long time, when multiplied by number of attach-to
> > > user functions, it will be a horrible regression in terms of
> > > attachment/detachment performance.
>
> Ah, got it. So attachment/detachment performance should be counted.
>
> > >
> > > So when we switch to per-CPU rw semaphore, we'll need to provide an
> > > internal batch uprobe attach/detach API to make sure that attaching to
> > > multiple uprobes is still fast.
>
> Yeah, we need such interface like register_uprobes(...).
>
> > >
> > > Which is why I was asking to land this patch as is, as it relieves the
> > > scalability pains in production and is easy to backport to old
> > > kernels. And then we can work on batched APIs and switch to per-CPU rw
> > > semaphore.
>
> OK, then I'll push this to for-next at this moment.

Great, thanks a lot!

> Please share if you have a good idea for the batch interface which can be
> backported. I guess it should involve updating userspace changes too.
>

Yep, we'll investigate the best way to provide a batch interface for
uprobes and will send patches.

> Thank you!
>
> > >
> > > So I hope you can reconsider and accept improvements in this patch,
> > > while Jonathan will keep working on even better final solution.
> > > Thanks!
> > >
> > > > I look forward to your formalized results :)
> > > >
> >
> > BTW, as part of BPF selftests, we have a multi-attach test for uprobes
> > and USDTs, reporting attach/detach timings:
> > $ sudo ./test_progs -v -t uprobe_multi_test/bench
> > bpf_testmod.ko is already unloaded.
> > Loading bpf_testmod.ko...
> > Successfully loaded bpf_testmod.ko.
> > test_bench_attach_uprobe:PASS:uprobe_multi_bench__open_and_load 0 nsec
> > test_bench_attach_uprobe:PASS:uprobe_multi_bench__attach 0 nsec
> > test_bench_attach_uprobe:PASS:uprobes_count 0 nsec
> > test_bench_attach_uprobe: attached in   0.120s
> > test_bench_attach_uprobe: detached in   0.092s
> > #400/5   uprobe_multi_test/bench_uprobe:OK
> > test_bench_attach_usdt:PASS:uprobe_multi__open 0 nsec
> > test_bench_attach_usdt:PASS:bpf_program__attach_usdt 0 nsec
> > test_bench_attach_usdt:PASS:usdt_count 0 nsec
> > test_bench_attach_usdt: attached in   0.124s
> > test_bench_

Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-03 Thread Andrii Nakryiko
On Wed, Apr 3, 2024 at 7:09 AM Masami Hiramatsu  wrote:
>
> On Wed, 3 Apr 2024 11:47:41 +0200
> Jiri Olsa  wrote:
>
> > On Wed, Apr 03, 2024 at 10:07:08AM +0900, Masami Hiramatsu wrote:
> > > Hi Jiri,
> > >
> > > On Tue,  2 Apr 2024 11:33:00 +0200
> > > Jiri Olsa  wrote:
> > >
> > > > Adding uretprobe syscall instead of trap to speed up return probe.
> > >
> > > This is interesting approach. But I doubt we need to add additional
> > > syscall just for this purpose. Can't we use another syscall or ioctl?
> >
> > so the plan is to optimize entry uprobe in a similar way and given
> > the syscall is not a scarce resource I wanted to add another syscall
> > for that one as well
> >
> > tbh I'm not sure sure which syscall or ioctl to reuse for this, it's
> > possible to do that, the trampoline will just have to save one or
> > more additional registers, but adding new syscall seems cleaner to me
>
> Hmm, I think a similar syscall is ptrace? prctl may also be a candidate.

I think both ptrace and prctl are for completely different use cases
and it would be an abuse of existing API to reuse them for uretprobe
tracing. Also, keep in mind, that any extra argument that has to be
passed into this syscall means that we need to complicate and slow
generated assembly code that is injected into user process (to
save/restore registers) and also kernel-side (again, to deal with all
the extra registers that would be stored/restored on stack).

Given syscalls are not some kind of scarce resources, what's the
downside to have a dedicated and simple syscall?

>
> >
> > >
> > > Also, we should run syzkaller on this syscall. And if uretprobe is
> >
> > right, I'll check on syzkaller
> >
> > > set in the user function, what happen if the user function directly
> > > calls this syscall? (maybe it consumes shadow stack?)
> >
> > the process should receive SIGILL if there's no pending uretprobe for
> > the current task, or it will trigger uretprobe if there's one pending
>
> No, that is too aggressive and not safe. Since the syscall is exposed to
> user program, it should return appropriate error code instead of SIGILL.
>

This is the way it is today with uretprobes even through interrupt.
E.g., it could happen that the user process is using fibers and is
replacing the stack pointer without the kernel realizing this, which will
trigger some defensive checks in the uretprobe handling code, and the
kernel will send SIGILL because it can't support such cases. This is
happening today already, and it works fine in practice (except for
applications that manually change the stack pointer; too bad, you can't
trace them with uretprobes, unfortunately).

So I think it's absolutely adequate to have this behavior if the user
process is *intentionally* abusing this API.

> >
> > but we could limit the syscall to be executed just from the trampoline,
> > that should prevent all the user space use cases, I'll do that in next
> > version and add more tests for that
>
> Why not limit? :) The uprobe_handle_trampoline() expects it is called
> only from the trampoline, so it is natural to check the caller address.
> (and uprobe should know where is the trampoline)
>
> Since the syscall is always exposed to the user program, it should
> - Do nothing and return an error unless it is properly called.
> - check the prerequisites for operation strictly.
> I concern that new system calls introduce vulnerabilities.
>

As Oleg and Jiri mentioned, this syscall can't harm the kernel or other
processes, only the process that is abusing the API. So any extra
checks that would slow down this approach are unnecessary overhead
and complication that will never be useful in practice.

Also note that sys_uretprobe is a kind of internal and unstable API
and it is explicitly called out that its contract can change at any
time and user space shouldn't rely on it. It's purely for the kernel's
own usage.

So let's please keep it fast and simple.


> Thank you,
>
>
> >
> > thanks,
> > jirka
> >
> >
> > >

[...]



Re: [PATCH] uprobes: reduce contention on uprobes_tree access

2024-04-03 Thread Andrii Nakryiko
On Wed, Apr 3, 2024 at 4:05 AM Jonthan Haslam  wrote:
>
> > > > > Given the discussion around per-cpu rw semaphore and need for
> > > > > (internal) batched attachment API for uprobes, do you think you can
> > > > > apply this patch as is for now? We can then gain initial improvements
> > > > > in scalability that are also easy to backport, and Jonathan will work
> > > > > on a more complete solution based on per-cpu RW semaphore, as
> > > > > suggested by Ingo.
> > > >
> > > > Yeah, it is interesting to use per-cpu rw semaphore on uprobe.
> > > > I would like to wait for the next version.
> > >
> > > My initial tests show a nice improvement on the over RW spinlocks but
> > > significant regression in acquiring a write lock. I've got a few days
> > > vacation over Easter but I'll aim to get some more formalised results out
> > > to the thread toward the end of next week.
> >
> > As far as the write lock is only on the cold path, I think you can choose
> > per-cpu RW semaphore. Since it does not do busy wait, the total system
> > performance impact will be small.
> > I look forward to your formalized results :)
>
> Sorry for the delay in getting back to you on this Masami.
>
> I have used one of the bpf selftest benchmarks to provide some form of
> comparison of the 3 different approaches (spinlock, RW spinlock and
> per-cpu RW semaphore). The benchmark used here is the 'trig-uprobe-nop'
> benchmark which just executes a single uprobe with a minimal bpf program
> attached. The tests were done on a 32 core qemu/kvm instance.
>

Thanks a lot for running benchmarks and providing results!

> Things to note about the results:
>
> - The results are slightly variable so don't get too caught up on
>   individual thread count - it's the trend that is important.
> - In terms of throughput with this specific benchmark a *very* macro view
>   is that the RW spinlock provides 40-60% more throughput than the
>   spinlock.  The per-CPU RW semaphore provides in the order of 50-100%
>   more throughput then the spinlock.
> - This doesn't fully reflect the large reduction in latency that we have
>   seen in application based measurements. However, it does demonstrate
>   that even the trivial change of going to a RW spinlock provides
>   significant benefits.

This is probably because trig-uprobe-nop creates a single uprobe that
is triggered on many CPUs, while in production we also have *many*
uprobes running on many CPUs. In this benchmark, besides contention on
uprobes_treelock, we are also hammering on other per-uprobe locks
(register_rwsem; also, if you don't have the [0] patch locally, there
will be another filter lock, filter->rwlock, taken each time). There is
also atomic refcounting going on, which, when you have the same uprobe
hit across all CPUs at the same time, will cause a bunch of cache line
bouncing.

So yes, it's understandable that in production you see an even larger
effect from optimizing uprobes_treelock than in this micro-benchmark.

  [0] 
https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/commit/?h=probes/for-next&id=366f7afd3de31d3ce2f4cbff97c6c23b6aa6bcdf

>
> I haven't included the measurements on per-CPU RW semaphore write
> performance as they are completely in line with those that Paul McKenney
> posted on his journal [0]. On a 32 core system I see semaphore writes to
> take in the order of 25-28 millisecs - the cost of the synchronize_rcu().
>
> Each block of results below show 1 line per execution of the benchmark (the
> "Summary" line) and each line is a run with one more thread added - a
> thread is a "producer". The lines are edited to remove extraneous output
> that adds no value here.
>
> The tests were executed with this driver script:
>
> for num_threads in {1..20}
> do
> sudo ./bench -p $num_threads trig-uprobe-nop | grep Summary

Just want to mention the -a (affinity) option that you can pass to the
bench tool; it will pin each thread to its own CPU. It generally makes
tests more uniform, eliminating CPU migration variability.

> done
>
>
> spinlock
>
> Summary: hits 1.453 ± 0.005M/s (  1.453M/prod)
> Summary: hits 2.087 ± 0.005M/s (  1.043M/prod)
> Summary: hits 2.701 ± 0.012M/s (  0.900M/prod)

I also wanted to point out that the first measurement (1.453M/s in
this row) is the total throughput across all threads, while the value in
parentheses (0.900M/prod) is the averaged throughput per thread. So
this M/prod value is the most interesting one in this benchmark, where
we assess the effect of reducing contention.

> Summary: hits 1.917 ± 0.011M/s (  0.479M/prod)
> Summary: hits 2.105 ± 0.003M/s (  0.421M/prod)
> Summary: hits 1.615 ± 0.006M/s (  0.269M/prod)

[...]



Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-03 Thread Andrii Nakryiko
On Wed, Apr 3, 2024 at 5:58 PM Masami Hiramatsu  wrote:
>
> On Wed, 3 Apr 2024 09:58:12 -0700
> Andrii Nakryiko  wrote:
>
> > On Wed, Apr 3, 2024 at 7:09 AM Masami Hiramatsu  wrote:
> > >
> > > On Wed, 3 Apr 2024 11:47:41 +0200
> > > Jiri Olsa  wrote:
> > >
> > > > On Wed, Apr 03, 2024 at 10:07:08AM +0900, Masami Hiramatsu wrote:
> > > > > Hi Jiri,
> > > > >
> > > > > On Tue,  2 Apr 2024 11:33:00 +0200
> > > > > Jiri Olsa  wrote:
> > > > >
> > > > > > Adding uretprobe syscall instead of trap to speed up return probe.
> > > > >
> > > > > This is interesting approach. But I doubt we need to add additional
> > > > > syscall just for this purpose. Can't we use another syscall or ioctl?
> > > >
> > > > so the plan is to optimize entry uprobe in a similar way and given
> > > > the syscall is not a scarce resource I wanted to add another syscall
> > > > for that one as well
> > > >
> > > > tbh I'm not sure sure which syscall or ioctl to reuse for this, it's
> > > > possible to do that, the trampoline will just have to save one or
> > > > more additional registers, but adding new syscall seems cleaner to me
> > >
> > > Hmm, I think a similar syscall is ptrace? prctl may also be a candidate.
> >
> > I think both ptrace and prctl are for completely different use cases
> > and it would be an abuse of existing API to reuse them for uretprobe
> > tracing. Also, keep in mind, that any extra argument that has to be
> > passed into this syscall means that we need to complicate and slow
> > generated assembly code that is injected into user process (to
> > save/restore registers) and also kernel-side (again, to deal with all
> > the extra registers that would be stored/restored on stack).
> >
> > Given syscalls are not some kind of scarce resources, what's the
> > downside to have a dedicated and simple syscall?
>
> Syscalls are explicitly exposed to user space, thus, even if it is used
> ONLY for a very specific situation, it is an official kernel interface,
> and need to care about the compatibility. (If it causes SIGILL unless
> a specific use case, I don't know there is a "compatibility".)

Check rt_sigreturn syscall (manpage at [0], for example).

   sigreturn() exists only to allow the implementation of signal
   handlers.  It should never be called directly.  (Indeed, a simple
   sigreturn() wrapper in the GNU C library simply returns -1, with
   errno set to ENOSYS.)  Details of the arguments (if any) passed
   to sigreturn() vary depending on the architecture.  (On some
   architectures, such as x86-64, sigreturn() takes no arguments,
   since all of the information that it requires is available in the
   stack frame that was previously created by the kernel on the
   user-space stack.)

This is a very similar use case. Also, check its source code in
arch/x86/kernel/signal_64.c. It sends SIGSEGV to the calling process
on any sign of something not being right. It's exactly the same with
sys_uretprobe.

  [0] https://man7.org/linux/man-pages/man2/sigreturn.2.html

> And the number of syscalls are limited resource.

We have almost 500 of them; it doesn't seem like adding 1-2 for good
reasons would be a problem. Can you please point me to where the limits
on syscalls as a resource are described? I'm curious to learn.

>
> I'm actually not sure how much we need to care of it, but adding a new
> syscall is worth to be discussed carefully because all of them are
> user-space compatibility.

Absolutely, it's a good discussion to have.

>
> > > > > Also, we should run syzkaller on this syscall. And if uretprobe is
> > > >
> > > > right, I'll check on syzkaller
> > > >
> > > > > set in the user function, what happen if the user function directly
> > > > > calls this syscall? (maybe it consumes shadow stack?)
> > > >
> > > > the process should receive SIGILL if there's no pending uretprobe for
> > > > the current task, or it will trigger uretprobe if there's one pending
> > >
> > > No, that is too aggressive and not safe. Since the syscall is exposed to
> > > user program, it should return appropriate error code instead of SIGILL.
> > >
> >
> > This is the way it is today with uretprobes even through interrupt.
>
> I doubt that the interrupt (exception) and syscall should be handled
> differently. Especially, this exception is injected 

Re: [PATCHv2 1/3] uprobe: Add uretprobe syscall to speed up return probe

2024-04-18 Thread Andrii Nakryiko
On Mon, Apr 15, 2024 at 1:25 AM Jiri Olsa  wrote:
>
> On Tue, Apr 02, 2024 at 11:33:00AM +0200, Jiri Olsa wrote:
>
> SNIP
>
> >  #include 
> >  #include 
> > @@ -308,6 +309,88 @@ static int uprobe_init_insn(struct arch_uprobe 
> > *auprobe, struct insn *insn, bool
> >  }
> >
> >  #ifdef CONFIG_X86_64
> > +
> > +asm (
> > + ".pushsection .rodata\n"
> > + ".global uretprobe_syscall_entry\n"
> > + "uretprobe_syscall_entry:\n"
> > + "pushq %rax\n"
> > + "pushq %rcx\n"
> > + "pushq %r11\n"
> > + "movq $" __stringify(__NR_uretprobe) ", %rax\n"
> > + "syscall\n"
> > + "popq %r11\n"
> > + "popq %rcx\n"
> > +
> > + /* The uretprobe syscall replaces stored %rax value with final
> > +  * return address, so we don't restore %rax in here and just
> > +  * call ret.
> > +  */
> > + "retq\n"
> > + ".global uretprobe_syscall_end\n"
> > + "uretprobe_syscall_end:\n"
> > + ".popsection\n"
> > +);
> > +
> > +extern u8 uretprobe_syscall_entry[];
> > +extern u8 uretprobe_syscall_end[];
> > +
> > +void *arch_uprobe_trampoline(unsigned long *psize)
> > +{
> > + *psize = uretprobe_syscall_end - uretprobe_syscall_entry;
> > + return uretprobe_syscall_entry;
>
> fyi I realized this screws 32-bit programs, we either need to add
> compat trampoline, or keep the standard breakpoint for them:
>
> +   struct pt_regs *regs = task_pt_regs(current);
> +   static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
> +
> +   if (user_64bit_mode(regs)) {
> +   *psize = uretprobe_syscall_end - uretprobe_syscall_entry;
> +   return uretprobe_syscall_entry;
> +   }
> +
> +   *psize = UPROBE_SWBP_INSN_SIZE;
> +   return &insn;
>
>
> not sure it's worth the effort to add the trampoline, I'll check
>

32-bit arches aren't a high-performance target anyway, so I'd probably
not bother and would prioritize simplicity and long-term maintenance.

>
> jirka



Re: [PATCH v9 29/36] bpf: Enable kprobe_multi feature if CONFIG_FPROBE is enabled

2024-04-25 Thread Andrii Nakryiko
On Mon, Apr 15, 2024 at 6:22 AM Masami Hiramatsu (Google)
 wrote:
>
> From: Masami Hiramatsu (Google) 
>
> Enable kprobe_multi feature if CONFIG_FPROBE is enabled. The pt_regs is
> converted from ftrace_regs by ftrace_partial_regs(), thus some registers
> may always returns 0. But it should be enough for function entry (access
> arguments) and exit (access return value).
>
> Signed-off-by: Masami Hiramatsu (Google) 
> Acked-by: Florent Revest 
> ---
>  Changes from previous series: NOTHING, Update against the new series.
> ---
>  kernel/trace/bpf_trace.c |   22 +-
>  1 file changed, 9 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index e51a6ef87167..57b1174030c9 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2577,7 +2577,7 @@ static int __init bpf_event_init(void)
>  fs_initcall(bpf_event_init);
>  #endif /* CONFIG_MODULES */
>
> -#if defined(CONFIG_FPROBE) && defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS)
> +#ifdef CONFIG_FPROBE
>  struct bpf_kprobe_multi_link {
> struct bpf_link link;
> struct fprobe fp;
> @@ -2600,6 +2600,8 @@ struct user_syms {
> char *buf;
>  };
>
> +static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs);

this is a waste if CONFIG_HAVE_PT_REGS_TO_FTRACE_REGS_CAST=y, right?
Can we guard it?


> +
>  static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, 
> u32 cnt)
>  {
> unsigned long __user usymbol;
> @@ -2792,13 +2794,14 @@ static u64 bpf_kprobe_multi_entry_ip(struct 
> bpf_run_ctx *ctx)
>
>  static int
>  kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> -  unsigned long entry_ip, struct pt_regs *regs)
> +  unsigned long entry_ip, struct ftrace_regs *fregs)
>  {
> struct bpf_kprobe_multi_run_ctx run_ctx = {
> .link = link,
> .entry_ip = entry_ip,
> };
> struct bpf_run_ctx *old_run_ctx;
> +   struct pt_regs *regs;
> int err;
>
> if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
> @@ -2809,6 +2812,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link 
> *link,
>
> migrate_disable();
> rcu_read_lock();
> +   regs = ftrace_partial_regs(fregs, 
> this_cpu_ptr(&bpf_kprobe_multi_pt_regs));

and then pass NULL if defined(CONFIG_HAVE_PT_REGS_TO_FTRACE_REGS_CAST)?
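
I.e., something along these lines (untested sketch, the helper macro
name is made up):

  #ifdef CONFIG_HAVE_PT_REGS_TO_FTRACE_REGS_CAST
  /* ftrace_regs already wraps pt_regs, no per-CPU scratch copy needed */
  #define bpf_kprobe_multi_pt_regs_ptr()  NULL
  #else
  static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs);
  #define bpf_kprobe_multi_pt_regs_ptr()  this_cpu_ptr(&bpf_kprobe_multi_pt_regs)
  #endif

and then in kprobe_multi_link_prog_run():

          regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());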


> old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> err = bpf_prog_run(link->link.prog, regs);
> bpf_reset_run_ctx(old_run_ctx);
> @@ -2826,13 +2830,9 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned 
> long fentry_ip,
>   void *data)
>  {
> struct bpf_kprobe_multi_link *link;
> -   struct pt_regs *regs = ftrace_get_regs(fregs);
> -
> -   if (!regs)
> -   return 0;
>
> link = container_of(fp, struct bpf_kprobe_multi_link, fp);
> -   kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
> +   kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), fregs);
> return 0;
>  }
>
> @@ -2842,13 +2842,9 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, 
> unsigned long fentry_ip,
>void *data)
>  {
> struct bpf_kprobe_multi_link *link;
> -   struct pt_regs *regs = ftrace_get_regs(fregs);
> -
> -   if (!regs)
> -   return;
>
> link = container_of(fp, struct bpf_kprobe_multi_link, fp);
> -   kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
> +   kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), fregs);
>  }
>
>  static int symbols_cmp_r(const void *a, const void *b, const void *priv)
> @@ -3107,7 +3103,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr 
> *attr, struct bpf_prog *pr
> kvfree(cookies);
> return err;
>  }
> -#else /* !CONFIG_FPROBE || !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
> +#else /* !CONFIG_FPROBE */
>  int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog 
> *prog)
>  {
> return -EOPNOTSUPP;
>
>



Re: [PATCH v9 36/36] fgraph: Skip recording calltime/rettime if it is not needed

2024-04-25 Thread Andrii Nakryiko
On Mon, Apr 15, 2024 at 6:25 AM Masami Hiramatsu (Google)
 wrote:
>
> From: Masami Hiramatsu (Google) 
>
> Skip recording calltime and rettime if the fgraph_ops does not need it.
> This is a kind of performance optimization for fprobe. Since the fprobe
> user does not use these entries, recording timestamp in fgraph is just
> a overhead (e.g. eBPF, ftrace). So introduce the skip_timestamp flag,
> and all fgraph_ops sets this flag, skip recording calltime and rettime.
>
> Suggested-by: Jiri Olsa 
> Signed-off-by: Masami Hiramatsu (Google) 
> ---
>  Changes in v9:
>   - Newly added.
> ---
>  include/linux/ftrace.h |2 ++
>  kernel/trace/fgraph.c  |   46 +++---
>  kernel/trace/fprobe.c  |1 +
>  3 files changed, 42 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index d845a80a3d56..06fc7cbef897 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -1156,6 +1156,8 @@ struct fgraph_ops {
> struct ftrace_ops   ops; /* for the hash lists */
> void*private;
> int idx;
> +   /* If skip_timestamp is true, this does not record timestamps. */
> +   boolskip_timestamp;
>  };
>
>  void *fgraph_reserve_data(int idx, int size_bytes);
> diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
> index 7556fbbae323..a5722537bb79 100644
> --- a/kernel/trace/fgraph.c
> +++ b/kernel/trace/fgraph.c
> @@ -131,6 +131,7 @@ DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
>  int ftrace_graph_active;
>
>  static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
> +static bool fgraph_skip_timestamp;
>
>  /* LRU index table for fgraph_array */
>  static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
> @@ -475,7 +476,7 @@ void ftrace_graph_stop(void)
>  static int
>  ftrace_push_return_trace(unsigned long ret, unsigned long func,
>  unsigned long frame_pointer, unsigned long *retp,
> -int fgraph_idx)
> +int fgraph_idx, bool skip_ts)
>  {
> struct ftrace_ret_stack *ret_stack;
> unsigned long long calltime;
> @@ -498,8 +499,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned 
> long func,
> ret_stack = get_ret_stack(current, current->curr_ret_stack, &index);
> if (ret_stack && ret_stack->func == func &&
> get_fgraph_type(current, index + FGRAPH_RET_INDEX) == 
> FGRAPH_TYPE_BITMAP &&
> -   !is_fgraph_index_set(current, index + FGRAPH_RET_INDEX, 
> fgraph_idx))
> +   !is_fgraph_index_set(current, index + FGRAPH_RET_INDEX, 
> fgraph_idx)) {
> +   /* If previous one skips calltime, update it. */
> +   if (!skip_ts && !ret_stack->calltime)
> +   ret_stack->calltime = trace_clock_local();
> return index + FGRAPH_RET_INDEX;
> +   }
>
> val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_RET_INDEX;
>
> @@ -517,7 +522,10 @@ ftrace_push_return_trace(unsigned long ret, unsigned 
> long func,
> return -EBUSY;
> }
>
> -   calltime = trace_clock_local();
> +   if (skip_ts)

would it be ok to add likely() here to keep the least-overhead code path linear?
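
E.g. (just to illustrate; assuming the skip_ts case is the hot path for
fprobe/BPF users):

          if (likely(skip_ts))
                  calltime = 0LL;
          else
                  calltime = trace_clock_local();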

> +   calltime = 0LL;
> +   else
> +   calltime = trace_clock_local();
>
> index = READ_ONCE(current->curr_ret_stack);
> ret_stack = RET_STACK(current, index);
> @@ -601,7 +609,8 @@ int function_graph_enter_regs(unsigned long ret, unsigned 
> long func,
> trace.func = func;
> trace.depth = ++current->curr_ret_depth;
>
> -   index = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
> +   index = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0,
> +fgraph_skip_timestamp);
> if (index < 0)
> goto out;
>
> @@ -654,7 +663,8 @@ int function_graph_enter_ops(unsigned long ret, unsigned 
> long func,
> return -ENODEV;
>
> /* Use start for the distance to ret_stack (skipping over reserve) */
> -   index = ftrace_push_return_trace(ret, func, frame_pointer, retp, 
> gops->idx);
> +   index = ftrace_push_return_trace(ret, func, frame_pointer, retp, 
> gops->idx,
> +gops->skip_timestamp);
> if (index < 0)
> return index;
> type = get_fgraph_type(current, index);
> @@ -732,6 +742,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, 
> unsigned long *ret,
> *ret = ret_stack->ret;
> trace->func = ret_stack->func;
> trace->calltime = ret_stack->calltime;
> +   trace->rettime = 0;
> trace->overrun = atomic_read(¤t->trace_overrun);
> trace->depth = current->curr_ret_depth;
> /*
> @@ -792,7 +803,6 @@ __ftrace_return_to_handler(struct f

Re: [PATCH v9 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-04-25 Thread Andrii Nakryiko
On Mon, Apr 15, 2024 at 5:49 AM Masami Hiramatsu (Google)
 wrote:
>
> Hi,
>
> Here is the 9th version of the series to re-implement the fprobe on
> function-graph tracer. The previous version is;
>
> https://lore.kernel.org/all/170887410337.564249.6360118840946697039.stgit@devnote2/
>
> This version is ported on the latest kernel (v6.9-rc3 + probes/for-next)
> and fixed some bugs + performance optimization patch[36/36].
>  - [12/36] Fix to clear fgraph_array entry in registration failure, also
>return -ENOSPC when fgraph_array is full.
>  - [28/36] Add new store_fprobe_entry_data() for fprobe.
>  - [31/36] Remove DIV_ROUND_UP() and fix entry data address calculation.
>  - [36/36] Add new flag to skip timestamp recording.
>
> Overview
> 
> This series does major 2 changes, enable multiple function-graphs on
> the ftrace (e.g. allow function-graph on sub instances) and rewrite the
> fprobe on this function-graph.
>
> The former changes had been sent from Steven Rostedt 4 years ago (*),
> which allows users to set different setting function-graph tracer (and
> other tracers based on function-graph) in each trace-instances at the
> same time.
>
> (*) https://lore.kernel.org/all/20190525031633.811342...@goodmis.org/
>
> The purpose of latter change are;
>
>  1) Remove dependency of the rethook from fprobe so that we can reduce
>the return hook code and shadow stack.
>
>  2) Make 'ftrace_regs' the common trace interface for the function
>boundary.
>
> 1) Currently we have 2(or 3) different function return hook codes,
>  the function-graph tracer and rethook (and legacy kretprobe).
>  But since this  is redundant and needs double maintenance cost,
>  I would like to unify those. From the user's viewpoint, function-
>  graph tracer is very useful to grasp the execution path. For this
>  purpose, it is hard to use the rethook in the function-graph
>  tracer, but the opposite is possible. (Strictly speaking, kretprobe
>  can not use it because it requires 'pt_regs' for historical reasons.)
>
> 2) Now the fprobe provides the 'pt_regs' for its handler, but that is
>  wrong for the function entry and exit. Moreover, depending on the
>  architecture, there is no way to accurately reproduce 'pt_regs'
>  outside of interrupt or exception handlers. This means fprobe should
>  not use 'pt_regs' because it does not use such exceptions.
>  (Conversely, kprobe should use 'pt_regs' because it is an abstract
>   interface of the software breakpoint exception.)
>
> This series changes fprobe to use function-graph tracer for tracing
> function entry and exit, instead of mixture of ftrace and rethook.
> Unlike the rethook which is a per-task list of system-wide allocated
> nodes, the function graph's ret_stack is a per-task shadow stack.
> Thus it does not need to set 'nr_maxactive' (which is the number of
> pre-allocated nodes).
> Also the handlers will get the 'ftrace_regs' instead of 'pt_regs'.
> Since eBPF mulit_kprobe/multi_kretprobe events still use 'pt_regs' as
> their register interface, this changes it to convert 'ftrace_regs' to
> 'pt_regs'. Of course this conversion makes an incomplete 'pt_regs',
> so users must access only registers for function parameters or
> return value.
>
> Design
> --
> Instead of using ftrace's function entry hook directly, the new fprobe
> is built on top of the function-graph's entry and return callbacks
> with 'ftrace_regs'.
>
> Since the fprobe requires access to 'ftrace_regs', the architecture
> must support CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS and
> CONFIG_HAVE_FTRACE_GRAPH_FUNC, which enables to call function-graph
> entry callback with 'ftrace_regs', and also
> CONFIG_HAVE_FUNCTION_GRAPH_FREGS, which passes the ftrace_regs to
> return_to_handler.
>
> All fprobes share a single function-graph ops (means shares a common
> ftrace filter) similar to the kprobe-on-ftrace. This needs another
> layer to find corresponding fprobe in the common function-graph
> callbacks, but has much better scalability, since the number of
> registered function-graph ops is limited.
>
> In the entry callback, the fprobe runs its entry_handler and saves the
> address of 'fprobe' on the function-graph's shadow stack as data. The
> return callback decodes the data to get the 'fprobe' address, and runs
> the exit_handler.
>
> The fprobe introduces two hash-tables, one is for entry callback which
> searches fprobes related to the given function address passed by entry
> callback. The other is for a return callback which checks if the given
> 'fprobe' data structure pointer is still valid. Note that it is
> possible to unregister fprobe before the return callback runs. Thus
> the address validation must be done before using it in the return
> callback.
>
> This series can be applied against the probes/for-next branch, which
> is based on v6.9-rc3.
>
> This series can also be found below branch.
>
> https://git.kernel.org/pub/scm/linux/kernel/git/mhiramat/linux.git/log/?h=topic/fprobe-on-

Re: [PATCHv3 bpf-next 1/7] uprobe: Wire up uretprobe system call

2024-04-26 Thread Andrii Nakryiko
On Sun, Apr 21, 2024 at 12:42 PM Jiri Olsa  wrote:
>
> Wiring up uretprobe system call, which comes in following changes.
> We need to do the wiring before, because the uretprobe implementation
> needs the syscall number.
>
> Note at the moment uretprobe syscall is supported only for native
> 64-bit process.
>
> Signed-off-by: Jiri Olsa 
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>  include/linux/syscalls.h   | 2 ++
>  include/uapi/asm-generic/unistd.h  | 5 -
>  kernel/sys_ni.c| 2 ++
>  4 files changed, 9 insertions(+), 1 deletion(-)
>

LGTM

Acked-by: Andrii Nakryiko 

> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
> b/arch/x86/entry/syscalls/syscall_64.tbl
> index 7e8d46f4147f..af0a33ab06ee 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -383,6 +383,7 @@
>  459common  lsm_get_self_attr   sys_lsm_get_self_attr
>  460common  lsm_set_self_attr   sys_lsm_set_self_attr
>  461common  lsm_list_modulessys_lsm_list_modules
> +46264  uretprobe   sys_uretprobe
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index e619ac10cd23..5318e0e76799 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -972,6 +972,8 @@ asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, 
> u32 flags);
>  /* x86 */
>  asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
>
> +asmlinkage long sys_uretprobe(void);
> +
>  /* pciconfig: alpha, arm, arm64, ia64, sparc */
>  asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
> unsigned long off, unsigned long len,
> diff --git a/include/uapi/asm-generic/unistd.h 
> b/include/uapi/asm-generic/unistd.h
> index 75f00965ab15..8a747cd1d735 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
>  #define __NR_lsm_list_modules 461
>  __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
>
> +#define __NR_uretprobe 462
> +__SYSCALL(__NR_uretprobe, sys_uretprobe)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 462
> +#define __NR_syscalls 463
>
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index faad00cce269..be6195e0d078 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -391,3 +391,5 @@ COND_SYSCALL(setuid16);
>
>  /* restartable sequence */
>  COND_SYSCALL(rseq);
> +
> +COND_SYSCALL(uretprobe);
> --
> 2.44.0
>



Re: [PATCHv3 bpf-next 2/7] uprobe: Add uretprobe syscall to speed up return probe

2024-04-26 Thread Andrii Nakryiko
On Sun, Apr 21, 2024 at 12:42 PM Jiri Olsa  wrote:
>
> Adding uretprobe syscall instead of trap to speed up return probe.
>
> At the moment the uretprobe setup/path is:
>
>   - install entry uprobe
>
>   - when the uprobe is hit, it overwrites probed function's return address
> on stack with address of the trampoline that contains breakpoint
> instruction
>
>   - the breakpoint trap code handles the uretprobe consumers execution and
> jumps back to original return address
>
> This patch replaces the above trampoline's breakpoint instruction with new
> ureprobe syscall call. This syscall does exactly the same job as the trap
> with some more extra work:
>
>   - syscall trampoline must save original value for rax/r11/rcx registers
> on stack - rax is set to syscall number and r11/rcx are changed and
> used by syscall instruction
>
>   - the syscall code reads the original values of those registers and
> restore those values in task's pt_regs area
>
>   - only caller from trampoline exposed in '[uprobes]' is allowed,
> the process will receive SIGILL signal otherwise
>
> Even with some extra work, using the uretprobes syscall shows speed
> improvement (compared to using standard breakpoint):
>
>   On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz)
>
>   current:
> uretprobe-nop  :1.498 ± 0.000M/s
> uretprobe-push :1.448 ± 0.001M/s
> uretprobe-ret  :0.816 ± 0.001M/s
>
>   with the fix:
> uretprobe-nop  :1.969 ± 0.002M/s  < 31% speed up
> uretprobe-push :1.910 ± 0.000M/s  < 31% speed up
> uretprobe-ret  :0.934 ± 0.000M/s  < 14% speed up
>
>   On Amd (AMD Ryzen 7 5700U)
>
>   current:
> uretprobe-nop  :0.778 ± 0.001M/s
> uretprobe-push :0.744 ± 0.001M/s
> uretprobe-ret  :0.540 ± 0.001M/s
>
>   with the fix:
> uretprobe-nop  :0.860 ± 0.001M/s  < 10% speed up
> uretprobe-push :0.818 ± 0.001M/s  < 10% speed up
> uretprobe-ret  :0.578 ± 0.000M/s  <  7% speed up
>
> The performance test spawns a thread that runs loop which triggers
> uprobe with attached bpf program that increments the counter that
> gets printed in results above.
>
> The uprobe (and uretprobe) kind is determined by which instruction
> is being patched with breakpoint instruction. That's also important
> for uretprobes, because uprobe is installed for each uretprobe.
>
> The performance test is part of bpf selftests:
>   tools/testing/selftests/bpf/run_bench_uprobes.sh
>
> Note at the moment uretprobe syscall is supported only for native
> 64-bit process, compat process still uses standard breakpoint.
>
> Suggested-by: Andrii Nakryiko 
> Signed-off-by: Oleg Nesterov 
> Signed-off-by: Jiri Olsa 
> ---
>  arch/x86/kernel/uprobes.c | 115 ++
>  include/linux/uprobes.h   |   3 +
>  kernel/events/uprobes.c   |  24 +---
>  3 files changed, 135 insertions(+), 7 deletions(-)
>

LGTM as far as I can follow the code

Acked-by: Andrii Nakryiko 

[...]



Re: [PATCHv3 bpf-next 5/7] selftests/bpf: Add uretprobe syscall call from user space test

2024-04-26 Thread Andrii Nakryiko
On Sun, Apr 21, 2024 at 12:43 PM Jiri Olsa  wrote:
>
> Adding test to verify that when called from outside of the
> trampoline provided by kernel, the uretprobe syscall will cause
> calling process to receive SIGILL signal and the attached bpf
> program is no executed.
>
> Signed-off-by: Jiri Olsa 
> ---
>  .../selftests/bpf/prog_tests/uprobe_syscall.c | 92 +++
>  .../selftests/bpf/progs/uprobe_syscall_call.c | 15 +++
>  2 files changed, 107 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall_call.c
>

See nits below, but overall LGTM

Acked-by: Andrii Nakryiko 

[...]

> @@ -219,6 +301,11 @@ static void test_uretprobe_regs_change(void)
>  {
> test__skip();
>  }
> +
> +static void test_uretprobe_syscall_call(void)
> +{
> +   test__skip();
> +}
>  #endif
>
>  void test_uprobe_syscall(void)
> @@ -228,3 +315,8 @@ void test_uprobe_syscall(void)
> if (test__start_subtest("uretprobe_regs_change"))
> test_uretprobe_regs_change();
>  }
> +
> +void serial_test_uprobe_syscall_call(void)

does it need to be serial? non-serial are still run sequentially
within a process (there is no multi-threading), it's more about some
global effects on system.

> +{
> +   test_uretprobe_syscall_call();
> +}
> diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_call.c 
> b/tools/testing/selftests/bpf/progs/uprobe_syscall_call.c
> new file mode 100644
> index ..5ea03bb47198
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_call.c
> @@ -0,0 +1,15 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include "vmlinux.h"
> +#include 
> +#include 
> +
> +struct pt_regs regs;
> +
> +char _license[] SEC("license") = "GPL";
> +
> +SEC("uretprobe//proc/self/exe:uretprobe_syscall_call")
> +int uretprobe(struct pt_regs *regs)
> +{
> +   bpf_printk("uretprobe called");

debugging leftover? we probably don't want to pollute trace_pipe from test

> +   return 0;
> +}
> --
> 2.44.0
>



Re: [PATCHv3 bpf-next 6/7] selftests/bpf: Add uretprobe compat test

2024-04-26 Thread Andrii Nakryiko
On Sun, Apr 21, 2024 at 12:43 PM Jiri Olsa  wrote:
>
> Adding test that adds return uprobe inside 32 bit task
> and verify the return uprobe and attached bpf programs
> get properly executed.
>
> Signed-off-by: Jiri Olsa 
> ---
>  tools/testing/selftests/bpf/.gitignore|  1 +
>  tools/testing/selftests/bpf/Makefile  |  6 ++-
>  .../selftests/bpf/prog_tests/uprobe_syscall.c | 40 +++
>  .../bpf/progs/uprobe_syscall_compat.c | 13 ++
>  4 files changed, 59 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall_compat.c
>
> diff --git a/tools/testing/selftests/bpf/.gitignore 
> b/tools/testing/selftests/bpf/.gitignore
> index f1aebabfb017..69d71223c0dd 100644
> --- a/tools/testing/selftests/bpf/.gitignore
> +++ b/tools/testing/selftests/bpf/.gitignore
> @@ -45,6 +45,7 @@ test_cpp
>  /veristat
>  /sign-file
>  /uprobe_multi
> +/uprobe_compat
>  *.ko
>  *.tmp
>  xskxceiver
> diff --git a/tools/testing/selftests/bpf/Makefile 
> b/tools/testing/selftests/bpf/Makefile
> index edc73f8f5aef..d170b63eca62 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -134,7 +134,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr 
> test_skb_cgroup_id_user \
> xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \
> xdp_features bpf_test_no_cfi.ko
>
> -TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi
> +TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi 
> uprobe_compat

you need to add uprobe_compat to TRUNNER_EXTRA_FILES as well, no?

>
>  # Emit succinct information message describing current building step
>  # $1 - generic step name (e.g., CC, LINK, etc);
> @@ -761,6 +761,10 @@ $(OUTPUT)/uprobe_multi: uprobe_multi.c
> $(call msg,BINARY,,$@)
> $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@
>
> +$(OUTPUT)/uprobe_compat:
> +   $(call msg,BINARY,,$@)
> +   $(Q)echo "int main() { return 0; }" | $(CC) $(CFLAGS) -xc -m32 -O0 - 
> -o $@
> +
>  EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)  \
> prog_tests/tests.h map_tests/tests.h verifier/tests.h   \
> feature bpftool \
> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
> b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> index 9233210a4c33..3770254d893b 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> @@ -11,6 +11,7 @@
>  #include 
>  #include "uprobe_syscall.skel.h"
>  #include "uprobe_syscall_call.skel.h"
> +#include "uprobe_syscall_compat.skel.h"
>
>  __naked unsigned long uretprobe_regs_trigger(void)
>  {
> @@ -291,6 +292,35 @@ static void test_uretprobe_syscall_call(void)
>  "read_trace_pipe_iter");
> ASSERT_EQ(found, 0, "found");
>  }
> +
> +static void trace_pipe_compat_cb(const char *str, void *data)
> +{
> +   if (strstr(str, "uretprobe compat") != NULL)
> +   (*(int *)data)++;
> +}
> +
> +static void test_uretprobe_compat(void)
> +{
> +   struct uprobe_syscall_compat *skel = NULL;
> +   int err, found = 0;
> +
> +   skel = uprobe_syscall_compat__open_and_load();
> +   if (!ASSERT_OK_PTR(skel, "uprobe_syscall_compat__open_and_load"))
> +   goto cleanup;
> +
> +   err = uprobe_syscall_compat__attach(skel);
> +   if (!ASSERT_OK(err, "uprobe_syscall_compat__attach"))
> +   goto cleanup;
> +
> +   system("./uprobe_compat");
> +
> +   ASSERT_OK(read_trace_pipe_iter(trace_pipe_compat_cb, &found, 1000),
> +"read_trace_pipe_iter");

why so complicated? can't you just set global variable that it was called

> +   ASSERT_EQ(found, 1, "found");
> +
> +cleanup:
> +   uprobe_syscall_compat__destroy(skel);
> +}
>  #else
>  static void test_uretprobe_regs_equal(void)
>  {
> @@ -306,6 +336,11 @@ static void test_uretprobe_syscall_call(void)
>  {
> test__skip();
>  }
> +
> +static void test_uretprobe_compat(void)
> +{
> +   test__skip();
> +}
>  #endif
>
>  void test_uprobe_syscall(void)
> @@ -320,3 +355,8 @@ void serial_test_uprobe_syscall_call(void)
>  {
> test_uretprobe_syscall_call();
>  }
> +
> +void serial_test_uprobe_syscall_compat(void)

and then no need for serial_test?

> +{
> +   test_uretprobe_compat();
> +}
> diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_compat.c 
> b/tools/testing/selftests/bpf/progs/uprobe_syscall_compat.c
> new file mode 100644
> index ..f8adde7f08e2
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_compat.c
> @@ -0,0 +1,13 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include 
> +#include 
> +#include 
> +
> +char _license[] SEC("license") = "GPL";
> +
> +SEC("uretprobe.multi/./uprobe_compat:main")
> +int uretprobe_compa

Re: [PATCHv3 bpf-next 5/7] selftests/bpf: Add uretprobe syscall call from user space test

2024-04-29 Thread Andrii Nakryiko
On Mon, Apr 29, 2024 at 12:33 AM Jiri Olsa  wrote:
>
> On Fri, Apr 26, 2024 at 11:03:29AM -0700, Andrii Nakryiko wrote:
> > On Sun, Apr 21, 2024 at 12:43 PM Jiri Olsa  wrote:
> > >
> > > Adding test to verify that when called from outside of the
> > > trampoline provided by kernel, the uretprobe syscall will cause
> > > calling process to receive SIGILL signal and the attached bpf
> > > program is no executed.
> > >
> > > Signed-off-by: Jiri Olsa 
> > > ---
> > >  .../selftests/bpf/prog_tests/uprobe_syscall.c | 92 +++
> > >  .../selftests/bpf/progs/uprobe_syscall_call.c | 15 +++
> > >  2 files changed, 107 insertions(+)
> > >  create mode 100644 
> > > tools/testing/selftests/bpf/progs/uprobe_syscall_call.c
> > >
> >
> > See nits below, but overall LGTM
> >
> > Acked-by: Andrii Nakryiko 
> >
> > [...]
> >
> > > @@ -219,6 +301,11 @@ static void test_uretprobe_regs_change(void)
> > >  {
> > > test__skip();
> > >  }
> > > +
> > > +static void test_uretprobe_syscall_call(void)
> > > +{
> > > +   test__skip();
> > > +}
> > >  #endif
> > >
> > >  void test_uprobe_syscall(void)
> > > @@ -228,3 +315,8 @@ void test_uprobe_syscall(void)
> > > if (test__start_subtest("uretprobe_regs_change"))
> > > test_uretprobe_regs_change();
> > >  }
> > > +
> > > +void serial_test_uprobe_syscall_call(void)
> >
> > does it need to be serial? non-serial are still run sequentially
> > within a process (there is no multi-threading), it's more about some
> > global effects on system.
>
> plz see below
>
> >
> > > +{
> > > +   test_uretprobe_syscall_call();
> > > +}
> > > diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_call.c 
> > > b/tools/testing/selftests/bpf/progs/uprobe_syscall_call.c
> > > new file mode 100644
> > > index ..5ea03bb47198
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_call.c
> > > @@ -0,0 +1,15 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +#include "vmlinux.h"
> > > +#include 
> > > +#include 
> > > +
> > > +struct pt_regs regs;
> > > +
> > > +char _license[] SEC("license") = "GPL";
> > > +
> > > +SEC("uretprobe//proc/self/exe:uretprobe_syscall_call")
> > > +int uretprobe(struct pt_regs *regs)
> > > +{
> > > +   bpf_printk("uretprobe called");
> >
> > debugging leftover? we probably don't want to pollute trace_pipe from test
>
> the reason for this is to make sure the bpf program was not executed,
>
> the test makes sure the child gets killed with SIGILL and also that
> the bpf program was not executed by checking the trace_pipe and
> making sure nothing was received
>
> the trace_pipe reading is also why it's serial

You could have attached the BPF program from the parent process and
used a global variable (and thus eliminated the system-wide trace_pipe
dependency), but OK, it's fine by me the way this is done.
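
For the record, what I had in mind is roughly this (untested sketch, the
variable name is made up). In the BPF program, instead of bpf_printk(),
set a global variable:

  int uretprobe_called;

  SEC("uretprobe//proc/self/exe:uretprobe_syscall_call")
  int uretprobe(struct pt_regs *regs)
  {
          uretprobe_called = 1;
          return 0;
  }

and then the test can simply assert that the program was never executed,
without reading trace_pipe:

          ASSERT_EQ(skel->bss->uretprobe_called, 0, "uretprobe_called");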

>
> jirka
>
> >
> > > +   return 0;
> > > +}
> > > --
> > > 2.44.0
> > >



Re: [PATCHv3 bpf-next 6/7] selftests/bpf: Add uretprobe compat test

2024-04-29 Thread Andrii Nakryiko
On Mon, Apr 29, 2024 at 12:39 AM Jiri Olsa  wrote:
>
> On Fri, Apr 26, 2024 at 11:06:53AM -0700, Andrii Nakryiko wrote:
> > On Sun, Apr 21, 2024 at 12:43 PM Jiri Olsa  wrote:
> > >
> > > Adding test that adds return uprobe inside 32 bit task
> > > and verify the return uprobe and attached bpf programs
> > > get properly executed.
> > >
> > > Signed-off-by: Jiri Olsa 
> > > ---
> > >  tools/testing/selftests/bpf/.gitignore|  1 +
> > >  tools/testing/selftests/bpf/Makefile  |  6 ++-
> > >  .../selftests/bpf/prog_tests/uprobe_syscall.c | 40 +++
> > >  .../bpf/progs/uprobe_syscall_compat.c | 13 ++
> > >  4 files changed, 59 insertions(+), 1 deletion(-)
> > >  create mode 100644 
> > > tools/testing/selftests/bpf/progs/uprobe_syscall_compat.c
> > >
> > > diff --git a/tools/testing/selftests/bpf/.gitignore 
> > > b/tools/testing/selftests/bpf/.gitignore
> > > index f1aebabfb017..69d71223c0dd 100644
> > > --- a/tools/testing/selftests/bpf/.gitignore
> > > +++ b/tools/testing/selftests/bpf/.gitignore
> > > @@ -45,6 +45,7 @@ test_cpp
> > >  /veristat
> > >  /sign-file
> > >  /uprobe_multi
> > > +/uprobe_compat
> > >  *.ko
> > >  *.tmp
> > >  xskxceiver
> > > diff --git a/tools/testing/selftests/bpf/Makefile 
> > > b/tools/testing/selftests/bpf/Makefile
> > > index edc73f8f5aef..d170b63eca62 100644
> > > --- a/tools/testing/selftests/bpf/Makefile
> > > +++ b/tools/testing/selftests/bpf/Makefile
> > > @@ -134,7 +134,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr 
> > > test_skb_cgroup_id_user \
> > > xskxceiver xdp_redirect_multi xdp_synproxy veristat 
> > > xdp_hw_metadata \
> > > xdp_features bpf_test_no_cfi.ko
> > >
> > > -TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi
> > > +TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi 
> > > uprobe_compat
> >
> > you need to add uprobe_compat to TRUNNER_EXTRA_FILES as well, no?
>
> ah right
>
> > > diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
> > > b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> > > index 9233210a4c33..3770254d893b 100644
> > > --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> > > +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> > > @@ -11,6 +11,7 @@
> > >  #include 
> > >  #include "uprobe_syscall.skel.h"
> > >  #include "uprobe_syscall_call.skel.h"
> > > +#include "uprobe_syscall_compat.skel.h"
> > >
> > >  __naked unsigned long uretprobe_regs_trigger(void)
> > >  {
> > > @@ -291,6 +292,35 @@ static void test_uretprobe_syscall_call(void)
> > >  "read_trace_pipe_iter");
> > > ASSERT_EQ(found, 0, "found");
> > >  }
> > > +
> > > +static void trace_pipe_compat_cb(const char *str, void *data)
> > > +{
> > > +   if (strstr(str, "uretprobe compat") != NULL)
> > > +   (*(int *)data)++;
> > > +}
> > > +
> > > +static void test_uretprobe_compat(void)
> > > +{
> > > +   struct uprobe_syscall_compat *skel = NULL;
> > > +   int err, found = 0;
> > > +
> > > +   skel = uprobe_syscall_compat__open_and_load();
> > > +   if (!ASSERT_OK_PTR(skel, "uprobe_syscall_compat__open_and_load"))
> > > +   goto cleanup;
> > > +
> > > +   err = uprobe_syscall_compat__attach(skel);
> > > +   if (!ASSERT_OK(err, "uprobe_syscall_compat__attach"))
> > > +   goto cleanup;
> > > +
> > > +   system("./uprobe_compat");
> > > +
> > > +   ASSERT_OK(read_trace_pipe_iter(trace_pipe_compat_cb, &found, 
> > > 1000),
> > > +"read_trace_pipe_iter");
> >
> > why so complicated? can't you just set global variable that it was called
>
> hm, we execute separate uprobe_compat (32bit) process that triggers the bpf
> program, so we can't use global variable.. using the trace_pipe was the only
> thing that was easy to do

you need a child process to trigger the uprobe, but you could have installed
the BPF program from the parent process (you'd need to make the child wait
for the parent to be ready, with a normal pipe() like w
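
Roughly this flow, i.e. attach in the parent and gate the child on a pipe
(just a sketch; skeleton and variable names are illustrative, error
handling is trimmed):

static void test_uretprobe_compat(void)
{
        /* sketch only: attach/cleanup details and error handling omitted */
        struct uprobe_syscall_executed *skel;
        int go[2], status;
        char c = 0;
        pid_t pid;

        if (!ASSERT_OK(pipe(go), "pipe"))
                return;

        pid = fork();
        if (pid == 0) {
                /* child: wait until the parent has attached, then run
                 * the 32-bit binary that triggers the uretprobe */
                read(go[0], &c, 1);
                execl("./uprobe_compat", "uprobe_compat", NULL);
                exit(1);
        }

        skel = uprobe_syscall_executed__open_and_load();
        /* ... attach the uretprobe to ./uprobe_compat, filtered by pid ... */

        write(go[1], &c, 1);            /* release the child */
        waitpid(pid, &status, 0);

        ASSERT_EQ(skel->bss->executed, 1, "executed");
}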

Re: [PATCH v9 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-04-29 Thread Andrii Nakryiko
On Mon, Apr 29, 2024 at 6:51 AM Masami Hiramatsu  wrote:
>
> Hi Andrii,
>
> On Thu, 25 Apr 2024 13:31:53 -0700
> Andrii Nakryiko  wrote:
>
> > Hey Masami,
> >
> > I can't really review most of that code as I'm completely unfamiliar
> > with all those inner workings of fprobe/ftrace/function_graph. I left
> > a few comments where there were somewhat more obvious BPF-related
> > pieces.
> >
> > But I also did run our BPF benchmarks on probes/for-next as a baseline
> > and then with your series applied on top. Just to see if there are any
> > regressions. I think it will be a useful data point for you.
>
> Thanks for testing!
>
> >
> > You should be already familiar with the bench tool we have in BPF
> > selftests (I used it on some other patches for your tree).
>
> What patches we need?
>

You mean the `bench` tool? It's part of BPF selftests (under
tools/testing/selftests/bpf); you can build it by running:

$ make RELEASE=1 -j$(nproc) bench

After that you'll get a self-contained `bench` binary, which has all
the benchmarks built in.
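
You can also run a single benchmark directly, e.g. (IIRC -w is warmup
time in seconds, -d is duration and -a pins the benchmark to a CPU):

$ ./bench -w2 -d5 -a trig-kprobe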

You might also find a small script (benchs/run_bench_trigger.sh inside
the BPF selftests directory) helpful; it collects the final summary of
the benchmark run and optionally accepts a specific set of benchmarks.
You can use it like this:

$ benchs/run_bench_trigger.sh kprobe kprobe-multi
kprobe :   18.731 ± 0.639M/s
kprobe-multi   :   23.938 ± 0.612M/s

By default it will run a wider set of benchmarks (no uprobes, but a
bunch of extra fentry/fexit tests and stuff like this).

> >
> > BASELINE
> > 
> > kprobe :   24.634 ± 0.205M/s
> > kprobe-multi   :   28.898 ± 0.531M/s
> > kretprobe  :   10.478 ± 0.015M/s
> > kretprobe-multi:   11.012 ± 0.063M/s
> >
> > THIS PATCH SET ON TOP
> > =
> > kprobe :   25.144 ± 0.027M/s (+2%)
> > kprobe-multi   :   28.909 ± 0.074M/s
> > kretprobe  :    9.482 ± 0.008M/s (-9.5%)
> > kretprobe-multi:   13.688 ± 0.027M/s (+24%)
>
> This looks good. Kretprobe should also use kretprobe-multi (fprobe)
> eventually because it should be a single callback version of
> kretprobe-multi.
>
> >
> > These numbers are pretty stable and look to be more or less representative.
> >
> > As you can see, kprobes got a bit faster, kprobe-multi seems to be
> > about the same, though.
> >
> > Then (I suppose they are "legacy") kretprobes got quite noticeably
> > slower, almost by 10%. Not sure why, but looks real after re-running
> > benchmarks a bunch of times and getting stable results.
>
> Hmm, kretprobe on x86 should use ftrace + rethook even with my series.
> So nothing should be changed. Maybe cache access pattern has been
> changed?
> I'll check it with tracefs (to remove the effect from bpf related changes)
>
> >
> > On the other hand, multi-kretprobes got significantly faster (+24%!).
> > Again, I don't know if it is expected or not, but it's a nice
> > improvement.
>
> Thanks!
>
> >
> > If you have any idea why kretprobes would get so much slower, it would
> > be nice to look into that and see if you can mitigate the regression
> > somehow. Thanks!
>
> OK, let me check it.
>
> Thank you!
>
> >
> >
> > >  51 files changed, 2325 insertions(+), 882 deletions(-)
> > >  create mode 100644 
> > > tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_repeat.tc
> > >
> > > --
> > > Masami Hiramatsu (Google) 
> > >
>
>
> --
> Masami Hiramatsu (Google) 



Re: [PATCH v9 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-04-29 Thread Andrii Nakryiko
On Sun, Apr 28, 2024 at 4:25 PM Steven Rostedt  wrote:
>
> On Thu, 25 Apr 2024 13:31:53 -0700
> Andrii Nakryiko  wrote:
>
> I'm just coming back from Japan (work and then a vacation), and
> catching up on my email during the 6 hour layover in Detroit.
>
> > Hey Masami,
> >
> > I can't really review most of that code as I'm completely unfamiliar
> > with all those inner workings of fprobe/ftrace/function_graph. I left
> > a few comments where there were somewhat more obvious BPF-related
> > pieces.
> >
> > But I also did run our BPF benchmarks on probes/for-next as a baseline
> > and then with your series applied on top. Just to see if there are any
> > regressions. I think it will be a useful data point for you.
> >
> > You should be already familiar with the bench tool we have in BPF
> > selftests (I used it on some other patches for your tree).
>
> I should get familiar with your tools too.
>

It's a nifty and self-contained tool for micro-benchmarking; I replied
to Masami with a few details on how to build and use it.

> >
> > BASELINE
> > 
> > kprobe :   24.634 ± 0.205M/s
> > kprobe-multi   :   28.898 ± 0.531M/s
> > kretprobe  :   10.478 ± 0.015M/s
> > kretprobe-multi:   11.012 ± 0.063M/s
> >
> > THIS PATCH SET ON TOP
> > =
> > kprobe :   25.144 ± 0.027M/s (+2%)
> > kprobe-multi   :   28.909 ± 0.074M/s
> > kretprobe  :    9.482 ± 0.008M/s (-9.5%)
> > kretprobe-multi:   13.688 ± 0.027M/s (+24%)
> >
> > These numbers are pretty stable and look to be more or less representative.
>
> Thanks for running this.
>
> >
> > As you can see, kprobes got a bit faster, kprobe-multi seems to be
> > about the same, though.
> >
> > Then (I suppose they are "legacy") kretprobes got quite noticeably
> > slower, almost by 10%. Not sure why, but looks real after re-running
> > benchmarks a bunch of times and getting stable results.
> >
> > On the other hand, multi-kretprobes got significantly faster (+24%!).
> > Again, I don't know if it is expected or not, but it's a nice
> > improvement.
> >
> > If you have any idea why kretprobes would get so much slower, it would
> > be nice to look into that and see if you can mitigate the regression
> > somehow. Thanks!
>
> My guess is that this patch set helps generic use cases for tracing the
> return of functions, but will likely add more overhead for single use
> cases. That is, kretprobe is made to be specific for a single function,
> but kretprobe-multi is more generic. Hence the generic version will
> improve at the sacrifice of the specific function. I did expect as much.
>
> That said, I think there's probably a lot of low hanging fruit that can
> be done to this series to help improve the kretprobe performance. I'm
> not sure we can get back to the baseline, but I'm hoping we can at
> least make it much better than that 10% slowdown.

That would certainly be appreciated, thanks!

But I'm also considering switching to multi-kprobe/kretprobe
automatically on the libbpf side, whenever possible, so that users get
the best performance. There might still be situations where this can't
be done, so singular kprobe/kretprobe can't be completely deprecated,
but the multi variants seem to be universally faster, so I'm going to
make them the default (I need to handle some backwards compat aspects,
but that's libbpf-specific stuff you shouldn't be concerned with).

>
> I'll be reviewing this patch set this week as I recover from jetlag.
>
> -- Steve



Re: [PATCH v9 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-04-30 Thread Andrii Nakryiko
On Tue, Apr 30, 2024 at 6:32 AM Masami Hiramatsu  wrote:
>
> On Mon, 29 Apr 2024 13:25:04 -0700
> Andrii Nakryiko  wrote:
>
> > On Mon, Apr 29, 2024 at 6:51 AM Masami Hiramatsu  
> > wrote:
> > >
> > > Hi Andrii,
> > >
> > > On Thu, 25 Apr 2024 13:31:53 -0700
> > > Andrii Nakryiko  wrote:
> > >
> > > > Hey Masami,
> > > >
> > > > I can't really review most of that code as I'm completely unfamiliar
> > > > with all those inner workings of fprobe/ftrace/function_graph. I left
> > > > a few comments where there were somewhat more obvious BPF-related
> > > > pieces.
> > > >
> > > > But I also did run our BPF benchmarks on probes/for-next as a baseline
> > > > and then with your series applied on top. Just to see if there are any
> > > > regressions. I think it will be a useful data point for you.
> > >
> > > Thanks for testing!
> > >
> > > >
> > > > You should be already familiar with the bench tool we have in BPF
> > > > selftests (I used it on some other patches for your tree).
> > >
> > > What patches we need?
> > >
> >
> > You mean for this `bench` tool? They are part of BPF selftests (under
> > tools/testing/selftests/bpf), you can build them by running:
> >
> > $ make RELEASE=1 -j$(nproc) bench
> >
> > After that you'll get a self-contained `bench` binary, which has all
> > the benchmarks built in.
> >
> > You might also find a small script (benchs/run_bench_trigger.sh inside
> > BPF selftests directory) helpful, it collects final summary of the
> > benchmark run and optionally accepts a specific set of benchmarks. So
> > you can use it like this:
> >
> > $ benchs/run_bench_trigger.sh kprobe kprobe-multi
> > kprobe :   18.731 ± 0.639M/s
> > kprobe-multi   :   23.938 ± 0.612M/s
> >
> > By default it will run a wider set of benchmarks (no uprobes, but a
> > bunch of extra fentry/fexit tests and stuff like this).
>
> origin:
> # benchs/run_bench_trigger.sh
> kretprobe :1.329 ± 0.007M/s
> kretprobe-multi:1.341 ± 0.004M/s
> # benchs/run_bench_trigger.sh
> kretprobe :1.288 ± 0.014M/s
> kretprobe-multi:1.365 ± 0.002M/s
> # benchs/run_bench_trigger.sh
> kretprobe :1.329 ± 0.002M/s
> kretprobe-multi:1.331 ± 0.011M/s
> # benchs/run_bench_trigger.sh
> kretprobe :1.311 ± 0.003M/s
> kretprobe-multi:1.318 ± 0.002M/s
>
> patched:
>
> # benchs/run_bench_trigger.sh
> kretprobe :1.274 ± 0.003M/s
> kretprobe-multi:1.397 ± 0.002M/s
> # benchs/run_bench_trigger.sh
> kretprobe :1.307 ± 0.002M/s
> kretprobe-multi:1.406 ± 0.004M/s
> # benchs/run_bench_trigger.sh
> kretprobe :1.279 ± 0.004M/s
> kretprobe-multi:1.330 ± 0.014M/s
> # benchs/run_bench_trigger.sh
> kretprobe :1.256 ± 0.010M/s
> kretprobe-multi:1.412 ± 0.003M/s
>
> Hmm, in my case, it seems smaller differences (~3%?).
> I attached perf report results for those, but I don't see large difference.

I ran my benchmarks on a bare metal machine (and quite a powerful one
at that, you can see my numbers are almost 10x yours), with mitigations
disabled, no retpolines, etc. If you have any of those mitigations
enabled, it might result in smaller differences. If you are running
inside QEMU or a VM, the results might differ significantly as well.

>
> > > >
> > > > BASELINE
> > > > 
> > > > kprobe :   24.634 ± 0.205M/s
> > > > kprobe-multi   :   28.898 ± 0.531M/s
> > > > kretprobe  :   10.478 ± 0.015M/s
> > > > kretprobe-multi:   11.012 ± 0.063M/s
> > > >
> > > > THIS PATCH SET ON TOP
> > > > =
> > > > kprobe :   25.144 ± 0.027M/s (+2%)
> > > > kprobe-multi   :   28.909 ± 0.074M/s
> > > > kretprobe  :    9.482 ± 0.008M/s (-9.5%)
> > > > kretprobe-multi:   13.688 ± 0.027M/s (+24%)
> > >
> > > This looks good. Kretprobe should also use kretprobe-multi (fprobe)
> > > eventually because it should be a single callback version of
> > > kretprobe-multi.
>
> I ran another benchmark (prctl loop, attached), the origin kernel result is 
> here;
>
> # sh ./benchmark.sh
> count = 1000, took 6.748133 sec
>
> And the patched kernel result;
>
> # sh ./benchmark.sh
> count = 1000, took 6.644095 sec
>
> I confirmed that the perf result has no big difference.
>
> Thank you,
>
>
> > >
> >

Re: [PATCHv4 bpf-next 5/7] selftests/bpf: Add uretprobe syscall call from user space test

2024-05-02 Thread Andrii Nakryiko
On Thu, May 2, 2024 at 5:24 AM Jiri Olsa  wrote:
>
> Adding test to verify that when called from outside of the
> trampoline provided by kernel, the uretprobe syscall will cause
> calling process to receive SIGILL signal and the attached bpf
> program is not executed.
>
> Reviewed-by: Masami Hiramatsu (Google) 
> Signed-off-by: Jiri Olsa 
> ---
>  .../selftests/bpf/prog_tests/uprobe_syscall.c | 95 +++
>  .../bpf/progs/uprobe_syscall_executed.c   | 17 
>  2 files changed, 112 insertions(+)
>  create mode 100644 
> tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
> b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> index 1a50cd35205d..c6fdb8c59ea3 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> @@ -7,7 +7,10 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include "uprobe_syscall.skel.h"
> +#include "uprobe_syscall_executed.skel.h"
>
>  __naked unsigned long uretprobe_regs_trigger(void)
>  {
> @@ -209,6 +212,91 @@ static void test_uretprobe_regs_change(void)
> }
>  }
>
> +#ifndef __NR_uretprobe
> +#define __NR_uretprobe 462
> +#endif
> +
> +__naked unsigned long uretprobe_syscall_call_1(void)
> +{
> +   /*
> +* Pretend we are uretprobe trampoline to trigger the return
> +* probe invocation in order to verify we get SIGILL.
> +*/
> +   asm volatile (
> +   "pushq %rax\n"
> +   "pushq %rcx\n"
> +   "pushq %r11\n"
> +   "movq $" __stringify(__NR_uretprobe) ", %rax\n"
> +   "syscall\n"
> +   "popq %r11\n"
> +   "popq %rcx\n"
> +   "retq\n"
> +   );
> +}
> +
> +__naked unsigned long uretprobe_syscall_call(void)
> +{
> +   asm volatile (
> +   "call uretprobe_syscall_call_1\n"
> +   "retq\n"
> +   );
> +}
> +
> +static void test_uretprobe_syscall_call(void)
> +{
> +   LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
> +   .retprobe = true,
> +   );
> +   struct uprobe_syscall_executed *skel;
> +   int pid, status, err, go[2], c;
> +
> +   if (pipe(go))
> +   return;

very unlikely to fail, but still, ASSERT_OK() would be in order here
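
i.e. something like:

        if (!ASSERT_OK(pipe(go), "pipe"))
                return;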

But regardless:

Acked-by: Andrii Nakryiko 

[...]



Re: [PATCHv4 bpf-next 6/7] selftests/bpf: Add uretprobe compat test

2024-05-02 Thread Andrii Nakryiko
On Thu, May 2, 2024 at 5:24 AM Jiri Olsa  wrote:
>
> Adding test that adds return uprobe inside 32-bit task
> and verify the return uprobe and attached bpf programs
> get properly executed.
>
> Reviewed-by: Masami Hiramatsu (Google) 
> Signed-off-by: Jiri Olsa 
> ---
>  tools/testing/selftests/bpf/.gitignore|  1 +
>  tools/testing/selftests/bpf/Makefile  |  7 ++-
>  .../selftests/bpf/prog_tests/uprobe_syscall.c | 60 +++
>  3 files changed, 67 insertions(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/bpf/.gitignore 
> b/tools/testing/selftests/bpf/.gitignore
> index f1aebabfb017..69d71223c0dd 100644
> --- a/tools/testing/selftests/bpf/.gitignore
> +++ b/tools/testing/selftests/bpf/.gitignore
> @@ -45,6 +45,7 @@ test_cpp
>  /veristat
>  /sign-file
>  /uprobe_multi
> +/uprobe_compat
>  *.ko
>  *.tmp
>  xskxceiver
> diff --git a/tools/testing/selftests/bpf/Makefile 
> b/tools/testing/selftests/bpf/Makefile
> index 82247aeef857..a94352162290 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -133,7 +133,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr 
> test_skb_cgroup_id_user \
> xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \
> xdp_features bpf_test_no_cfi.ko
>
> -TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi
> +TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi 
> uprobe_compat
>
>  ifneq ($(V),1)
>  submake_extras := feature_display=0
> @@ -631,6 +631,7 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read 
> $(OUTPUT)/bpf_testmod.ko  \
>$(OUTPUT)/xdp_synproxy   \
>$(OUTPUT)/sign-file  \
>$(OUTPUT)/uprobe_multi   \
> +  $(OUTPUT)/uprobe_compat  \
>ima_setup.sh \
>verify_sig_setup.sh  \
>$(wildcard progs/btf_dump_test_case_*.c) \
> @@ -752,6 +753,10 @@ $(OUTPUT)/uprobe_multi: uprobe_multi.c
> $(call msg,BINARY,,$@)
> $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@
>
> +$(OUTPUT)/uprobe_compat:
> +   $(call msg,BINARY,,$@)
> +   $(Q)echo "int main() { return 0; }" | $(CC) $(CFLAGS) -xc -m32 -O0 - 
> -o $@
> +
>  EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)  \
> prog_tests/tests.h map_tests/tests.h verifier/tests.h   \
> feature bpftool \
> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
> b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> index c6fdb8c59ea3..bfea9a0368a4 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> @@ -5,6 +5,7 @@
>  #ifdef __x86_64__
>
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -297,6 +298,58 @@ static void test_uretprobe_syscall_call(void)
> close(go[1]);
> close(go[0]);
>  }
> +
> +static void test_uretprobe_compat(void)
> +{
> +   LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
> +   .retprobe = true,
> +   );
> +   struct uprobe_syscall_executed *skel;
> +   int err, go[2], pid, c, status;
> +
> +   if (pipe(go))
> +   return;

ASSERT_OK() missing, like in the previous patch

Thanks for switching to pipe() + global variable instead of using trace_pipe.

Acked-by: Andrii Nakryiko 

> +
> +   skel = uprobe_syscall_executed__open_and_load();
> +   if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
> +   goto cleanup;
> +

[...]



Re: [PATCHv4 bpf-next 0/7] uprobe: uretprobe speed up

2024-05-02 Thread Andrii Nakryiko
On Thu, May 2, 2024 at 5:23 AM Jiri Olsa  wrote:
>
> hi,
> as part of the effort on speeding up the uprobes [0] coming with
> return uprobe optimization by using syscall instead of the trap
> on the uretprobe trampoline.
>
> The speed up depends on instruction type that uprobe is installed
> and depends on specific HW type, please check patch 1 for details.
>
> Patches 1-6 are based on bpf-next/master, but path 1 and 2 are
> apply-able on linux-trace.git tree probes/for-next branch.
> Patch 7 is based on man-pages master.
>
> v4 changes:
>   - added acks [Oleg,Andrii,Masami]
>   - reworded the man page and adding more info to NOTE section [Masami]
>   - rewrote bpf tests not to use trace_pipe [Andrii]
>   - cc-ed linux-man list
>
> Also available at:
>   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
>   uretprobe_syscall
>

It looks great to me, thanks! Unfortunately, the BPF CI build is broken,
probably due to some of the Makefile additions. Please investigate and
fix it (or we'll need to fix something on the BPF CI side); either way,
it looks like you'll need another revision.

pw-bot: cr

  [0] 
https://github.com/kernel-patches/bpf/actions/runs/8923849088/job/24509002194



But while we are at it.

Masami, Oleg,

What should be the logistics of landing this? Can/should we route this
through the bpf-next tree, given there are lots of BPF-based
selftests? Or do you want to take this through
linux-trace/probes/for-next? In the latter case, it's probably better
to apply only the first two patches to probes/for-next and the rest
should still go through the bpf-next tree (otherwise we are running
into conflicts in BPF selftests). Previously we were handling such
cross-tree dependencies by creating a named branch or tag, and merging
it into bpf-next (so that all SHAs are preserved). It's a bunch of
extra work for everyone involved, so the simplest way would be to just
land through bpf-next, of course. But let me know your preferences.

Thanks!

> thanks,
> jirka
>
>
> Notes to check list items in Documentation/process/adding-syscalls.rst:
>
> - System Call Alternatives
>   New syscall seems like the best way in here, becase we need

typo (thanks, Gmail): because

>   just to quickly enter kernel with no extra arguments processing,
>   which we'd need to do if we decided to use another syscall.
>
> - Designing the API: Planning for Extension
>   The uretprobe syscall is very specific and most likely won't be
>   extended in the future.
>
>   At the moment it does not take any arguments and even if it does
>   in future, it's allowed to be called only from trampoline prepared
>   by kernel, so there'll be no broken user.
>
> - Designing the API: Other Considerations
>   N/A because uretprobe syscall does not return reference to kernel
>   object.
>
> - Proposing the API
>   Wiring up of the uretprobe system call si in separate change,

typo: is

>   selftests and man page changes are part of the patchset.
>
> - Generic System Call Implementation
>   There's no CONFIG option for the new functionality because it
>   keeps the same behaviour from the user POV.
>
> - x86 System Call Implementation
>   It's 64-bit syscall only.
>
> - Compatibility System Calls (Generic)
>   N/A uretprobe syscall has no arguments and is not supported
>   for compat processes.
>
> - Compatibility System Calls (x86)
>   N/A uretprobe syscall is not supported for compat processes.
>
> - System Calls Returning Elsewhere
>   N/A.
>
> - Other Details
>   N/A.
>
> - Testing
>   Adding new bpf selftests and ran ltp on top of this change.
>
> - Man Page
>   Attached.
>
> - Do not call System Calls in the Kernel
>   N/A.
>
>
> [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
> ---
> Jiri Olsa (6):
>   uprobe: Wire up uretprobe system call
>   uprobe: Add uretprobe syscall to speed up return probe
>   selftests/bpf: Add uretprobe syscall test for regs integrity
>   selftests/bpf: Add uretprobe syscall test for regs changes
>   selftests/bpf: Add uretprobe syscall call from user space test
>   selftests/bpf: Add uretprobe compat test
>
>  arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
>  arch/x86/kernel/uprobes.c   | 115 
> 
>  include/linux/syscalls.h|   2 +
>  include/linux/uprobes.h |   3 +
>  include/uapi/asm-generic/unistd.h   |   5 +-
>  kernel/events/uprobes.c |  24 --
>  kernel/sys_ni.c |   2 +
>  tools/include/linux/compiler.h  |   4 +
>  tools/testing/selftests/bpf/.gitignore  |   1 +
>  tools/testing/selftests/bpf/Makefile|   7 +-
>  tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 
> -
>  tools/testing/selftests/bpf/prog_te

Re: [PATCHv4 bpf-next 0/7] uprobe: uretprobe speed up

2024-05-03 Thread Andrii Nakryiko
On Thu, May 2, 2024 at 1:04 PM Jiri Olsa  wrote:
>
> On Thu, May 02, 2024 at 09:43:02AM -0700, Andrii Nakryiko wrote:
> > On Thu, May 2, 2024 at 5:23 AM Jiri Olsa  wrote:
> > >
> > > hi,
> > > as part of the effort on speeding up the uprobes [0] coming with
> > > return uprobe optimization by using syscall instead of the trap
> > > on the uretprobe trampoline.
> > >
> > > The speed up depends on instruction type that uprobe is installed
> > > and depends on specific HW type, please check patch 1 for details.
> > >
> > > Patches 1-6 are based on bpf-next/master, but path 1 and 2 are
> > > apply-able on linux-trace.git tree probes/for-next branch.
> > > Patch 7 is based on man-pages master.
> > >
> > > v4 changes:
> > >   - added acks [Oleg,Andrii,Masami]
> > >   - reworded the man page and adding more info to NOTE section [Masami]
> > >   - rewrote bpf tests not to use trace_pipe [Andrii]
> > >   - cc-ed linux-man list
> > >
> > > Also available at:
> > >   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> > >   uretprobe_syscall
> > >
> >
> > It looks great to me, thanks! Unfortunately BPF CI build is broken,
> > probably due to some of the Makefile additions, please investigate and
> > fix (or we'll need to fix something on BPF CI side), but it looks like
> > you'll need another revision, unfortunately.
> >
> > pw-bot: cr
> >
> >   [0] 
> > https://github.com/kernel-patches/bpf/actions/runs/8923849088/job/24509002194
>
> yes, I think it's missing the 32-bit libc for uprobe_compat binary,
> probably it needs to be added to github.com:libbpf/ci.git 
> setup-build-env/action.yml ?
> hm but I'm not sure how to test it, need to check

You can create a custom PR directly against the GitHub repo
(kernel-patches/bpf) and BPF CI will run all the tests on your custom
code. This way you can iterate without spamming the mailing list.

But I'm just wondering if it's worth complicating the setup just for
testing this x32 compat mode. So maybe just dropping one of those
patches would be better?

>
> >
> >
> >
> > But while we are at it.
> >
> > Masami, Oleg,
> >
> > What should be the logistics of landing this? Can/should we route this
> > through the bpf-next tree, given there are lots of BPF-based
> > selftests? Or you want to take this through
> > linux-trace/probes/for-next? In the latter case, it's probably better
> > to apply only the first two patches to probes/for-next and the rest
> > should still go through the bpf-next tree (otherwise we are running
>
> I think this was the plan, previously mentioned in here:
> https://lore.kernel.org/bpf/20240423000943.478ccf1e735a63c6c1b4c...@kernel.org/
>

Ok, then we'll have to land this patch set as two separate ones. It's
fine, let's figure out if you need to do anything for shadow stacks
and try to land it soon.

> > into conflicts in BPF selftests). Previously we were handling such
> > cross-tree dependencies by creating a named branch or tag, and merging
> > it into bpf-next (so that all SHAs are preserved). It's a bunch of
> > extra work for everyone involved, so the simplest way would be to just
> > land through bpf-next, of course. But let me know your preferences.
> >
> > Thanks!
> >
> > > thanks,
> > > jirka
> > >
> > >
> > > Notes to check list items in Documentation/process/adding-syscalls.rst:
> > >
> > > - System Call Alternatives
> > >   New syscall seems like the best way in here, becase we need
> >
> > typo (thanks, Gmail): because
>
> ok
>
> >
> > >   just to quickly enter kernel with no extra arguments processing,
> > >   which we'd need to do if we decided to use another syscall.
> > >
> > > - Designing the API: Planning for Extension
> > >   The uretprobe syscall is very specific and most likely won't be
> > >   extended in the future.
> > >
> > >   At the moment it does not take any arguments and even if it does
> > >   in future, it's allowed to be called only from trampoline prepared
> > >   by kernel, so there'll be no broken user.
> > >
> > > - Designing the API: Other Considerations
> > >   N/A because uretprobe syscall does not return reference to kernel
> > >   object.
> > >
> > > - Proposing the API
> > >   Wiring up of the uretprobe system call si in separate change,
> >
> > typo: is
>
> ok, thanks
>
> jirka



Re: [PATCHv5 bpf-next 5/8] selftests/bpf: Add uretprobe syscall call from user space test

2024-05-07 Thread Andrii Nakryiko
On Tue, May 7, 2024 at 3:54 AM Jiri Olsa  wrote:
>
> Adding test to verify that when called from outside of the
> trampoline provided by kernel, the uretprobe syscall will cause
> calling process to receive SIGILL signal and the attached bpf
> program is not executed.
>
> Reviewed-by: Masami Hiramatsu (Google) 
> Signed-off-by: Jiri Olsa 
> ---
>  .../selftests/bpf/prog_tests/uprobe_syscall.c | 95 +++
>  .../bpf/progs/uprobe_syscall_executed.c   | 17 
>  2 files changed, 112 insertions(+)
>  create mode 100644 
> tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
>

Acked-by: Andrii Nakryiko 

> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
> b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> index 1a50cd35205d..3ef324c2db50 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> @@ -7,7 +7,10 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include "uprobe_syscall.skel.h"
> +#include "uprobe_syscall_executed.skel.h"
>

[...]



Re: [PATCH v9 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-05-07 Thread Andrii Nakryiko
On Wed, May 1, 2024 at 7:06 PM Masami Hiramatsu  wrote:
>
> On Tue, 30 Apr 2024 09:29:40 -0700
> Andrii Nakryiko  wrote:
>
> > On Tue, Apr 30, 2024 at 6:32 AM Masami Hiramatsu  
> > wrote:
> > >
> > > On Mon, 29 Apr 2024 13:25:04 -0700
> > > Andrii Nakryiko  wrote:
> > >
> > > > On Mon, Apr 29, 2024 at 6:51 AM Masami Hiramatsu  
> > > > wrote:
> > > > >
> > > > > Hi Andrii,
> > > > >
> > > > > On Thu, 25 Apr 2024 13:31:53 -0700
> > > > > Andrii Nakryiko  wrote:
> > > > >
> > > > > > Hey Masami,
> > > > > >
> > > > > > I can't really review most of that code as I'm completely unfamiliar
> > > > > > with all those inner workings of fprobe/ftrace/function_graph. I 
> > > > > > left
> > > > > > a few comments where there were somewhat more obvious BPF-related
> > > > > > pieces.
> > > > > >
> > > > > > But I also did run our BPF benchmarks on probes/for-next as a 
> > > > > > baseline
> > > > > > and then with your series applied on top. Just to see if there are 
> > > > > > any
> > > > > > regressions. I think it will be a useful data point for you.
> > > > >
> > > > > Thanks for testing!
> > > > >
> > > > > >
> > > > > > You should be already familiar with the bench tool we have in BPF
> > > > > > selftests (I used it on some other patches for your tree).
> > > > >
> > > > > What patches we need?
> > > > >
> > > >
> > > > You mean for this `bench` tool? They are part of BPF selftests (under
> > > > tools/testing/selftests/bpf), you can build them by running:
> > > >
> > > > $ make RELEASE=1 -j$(nproc) bench
> > > >
> > > > After that you'll get a self-contained `bench` binary, which has all
> > > > the benchmarks built in.
> > > >
> > > > You might also find a small script (benchs/run_bench_trigger.sh inside
> > > > BPF selftests directory) helpful, it collects final summary of the
> > > > benchmark run and optionally accepts a specific set of benchmarks. So
> > > > you can use it like this:
> > > >
> > > > $ benchs/run_bench_trigger.sh kprobe kprobe-multi
> > > > kprobe :   18.731 ± 0.639M/s
> > > > kprobe-multi   :   23.938 ± 0.612M/s
> > > >
> > > > By default it will run a wider set of benchmarks (no uprobes, but a
> > > > bunch of extra fentry/fexit tests and stuff like this).
> > >
> > > origin:
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.329 ± 0.007M/s
> > > kretprobe-multi:1.341 ± 0.004M/s
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.288 ± 0.014M/s
> > > kretprobe-multi:1.365 ± 0.002M/s
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.329 ± 0.002M/s
> > > kretprobe-multi:1.331 ± 0.011M/s
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.311 ± 0.003M/s
> > > kretprobe-multi:1.318 ± 0.002M/s
> > >
> > > patched:
> > >
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.274 ± 0.003M/s
> > > kretprobe-multi:1.397 ± 0.002M/s
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.307 ± 0.002M/s
> > > kretprobe-multi:1.406 ± 0.004M/s
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.279 ± 0.004M/s
> > > kretprobe-multi:1.330 ± 0.014M/s
> > > # benchs/run_bench_trigger.sh
> > > kretprobe :1.256 ± 0.010M/s
> > > kretprobe-multi:1.412 ± 0.003M/s
> > >
> > > Hmm, in my case, it seems smaller differences (~3%?).
> > > I attached perf report results for those, but I don't see large 
> > > difference.
> >
> > I ran my benchmarks on bare metal machine (and quite powerful at that,
> > you can see my numbers are almost 10x of yours), with mitigations
> > disabled, no retpolines, etc. If you have any of those mitigations it
> > might result in smaller differences, probably. If you are running
> > inside QEMU/VM, the results might differ significantly as well.
>
> I ran it on my bare metal machines again, but could not find any difference
> between them. But I think I enabled intel mitigations on, so it might make
> a difference from your result.
>
> Can you run the benchmark with perf record? If there is such differences,
> there should be recorded.

I can, yes, and will try to do it this week. I'm just trying to keep up
with the rest of the stuff on my plate and haven't yet found time to do
this. I'll get back to you (and I'll use the latest version of your
patch set, of course).

> e.g.
>
> # perf record -g -o perf.data-kretprobe-nopatch-raw-bpf -- bench -w2 -d5 -a 
> trig-kretprobe
> # perf report -G -i perf.data-kretprobe-nopatch-raw-bpf -k $VMLINUX --stdio > 
> perf-out-kretprobe-nopatch-raw-bpf
>
> I attached the results in my side.
> The interesting point is, the functions int the result are not touched by
> this series. Thus there may be another reason if you see the kretprobe
> regression.
>
> Thank you,
> --
> Masami Hiramatsu (Google) 



Re: [syzbot] [bpf?] [trace?] general protection fault in bpf_get_attach_cookie_tracing

2024-05-07 Thread Andrii Nakryiko
On Sun, May 5, 2024 at 9:13 AM syzbot
 wrote:
>
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit:a9e7715ce8b3 libbpf: Avoid casts from pointers to enums in..
> git tree:   bpf-next
> console+strace: https://syzkaller.appspot.com/x/log.txt?x=153c1dc498
> kernel config:  https://syzkaller.appspot.com/x/.config?x=e8aa3e4736485e94
> dashboard link: https://syzkaller.appspot.com/bug?extid=3ab78ff125b7979e45f9
> compiler:   Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 
> 2.40
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=17d4b58898
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16cb047098
>
> Downloadable assets:
> disk image: 
> https://storage.googleapis.com/syzbot-assets/a6daa7801875/disk-a9e7715c.raw.xz
> vmlinux: 
> https://storage.googleapis.com/syzbot-assets/0d5b51385a69/vmlinux-a9e7715c.xz
> kernel image: 
> https://storage.googleapis.com/syzbot-assets/999297a08631/bzImage-a9e7715c.xz
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+3ab78ff125b7979e4...@syzkaller.appspotmail.com
>
> general protection fault, probably for non-canonical address 
> 0xdc00:  [#1] PREEMPT SMP KASAN PTI

I suspect it's the same issue that we already fixed ([0]) in
bpf/master; the fix just hasn't made it into the bpf-next tree yet

  [0] 1a80dbcb2dba bpf: support deferring bpf_link dealloc to after
RCU grace period

> KASAN: null-ptr-deref in range [0x-0x0007]
> CPU: 0 PID: 5082 Comm: syz-executor316 Not tainted 
> 6.9.0-rc5-syzkaller-01452-ga9e7715ce8b3 #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 03/27/2024
> RIP: 0010:bpf_get_attach_cookie_tracing kernel/trace/bpf_trace.c:1179 
> [inline]
> RIP: 0010:bpf_get_attach_cookie_tracing+0x46/0x60 
> kernel/trace/bpf_trace.c:1174
> Code: d3 03 00 48 81 c3 00 18 00 00 48 89 d8 48 c1 e8 03 42 80 3c 30 00 74 08 
> 48 89 df e8 54 b9 59 00 48 8b 1b 48 89 d8 48 c1 e8 03 <42> 80 3c 30 00 74 08 
> 48 89 df e8 3b b9 59 00 48 8b 03 5b 41 5e c3
> RSP: 0018:c90002f9fba8 EFLAGS: 00010246
> RAX:  RBX:  RCX: 888029575a00
> RDX:  RSI: c9ace048 RDI: 
> RBP: c90002f9fbc0 R08: 89938ae7 R09: 125e80a0
> R10: dc00 R11: a950 R12: c90002f9fc80
> R13: dc00 R14: dc00 R15: 
> FS:  78992380() GS:8880b940() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 2e3e9388 CR3: 791c2000 CR4: 003506f0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> Call Trace:
>  
>  bpf_prog_fe13437f26555f61+0x1a/0x1c
>  bpf_dispatcher_nop_func include/linux/bpf.h:1243 [inline]
>  __bpf_prog_run include/linux/filter.h:691 [inline]
>  bpf_prog_run include/linux/filter.h:698 [inline]
>  __bpf_prog_test_run_raw_tp+0x149/0x310 net/bpf/test_run.c:732
>  bpf_prog_test_run_raw_tp+0x47b/0x6a0 net/bpf/test_run.c:772
>  bpf_prog_test_run+0x33a/0x3b0 kernel/bpf/syscall.c:4286
>  __sys_bpf+0x48d/0x810 kernel/bpf/syscall.c:5700
>  __do_sys_bpf kernel/bpf/syscall.c:5789 [inline]
>  __se_sys_bpf kernel/bpf/syscall.c:5787 [inline]
>  __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5787
>  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
>  do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83
>  entry_SYSCALL_64_after_hwframe+0x77/0x7f
> RIP: 0033:0x7f53be8a0469
> Code: 48 83 c4 28 c3 e8 37 17 00 00 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 
> 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 
> 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:7ffdcf680a08 EFLAGS: 0246 ORIG_RAX: 0141
> RAX: ffda RBX: 7ffdcf680bd8 RCX: 7f53be8a0469
> RDX: 000c RSI: 2080 RDI: 000a
> RBP: 7f53be913610 R08:  R09: 7ffdcf680bd8
> R10: 7f53be8dbae3 R11: 0246 R12: 0001
> R13: 7ffdcf680bc8 R14: 0001 R15: 0001
>  
> Modules linked in:
> ---[ end trace  ]---
> RIP: 0010:bpf_get_attach_cookie_tracing kernel/trace/bpf_trace.c:1179 
> [inline]
> RIP: 0010:bpf_get_attach_cookie_tracing+0x46/0x60 
> kernel/trace/bpf_trace.c:1174
> Code: d3 03 00 48 81 c3 00 18 00 00 48 89 d8 48 c1 e8 03 42 80 3c 30 00 74 08 
> 48 89 df e8 54 b9 59 00 48 8b 1b 48 89 d8 48 c1 e8 03 <42> 80 3c 30 00 74 08 
> 48 89 df e8 3b b9 59 00 48 8b 03 5b 41 5e c3
> RSP: 0018:c90002f9fba8 EFLAGS: 00010246
> RAX:  RBX:  RCX: 888029575a00
> RDX:  RSI: c9ace048 RDI: 
> RBP: c90002f9fbc0 R08: 89938ae7 R09: 125e80a0
> R10: dc00 R11: a950 R12

Re: [RFC PATCH] tracing: change syscall number type in struct syscall_trace_*

2023-10-03 Thread Andrii Nakryiko
On Mon, Oct 2, 2023 at 6:53 AM Artem Savkov  wrote:
>
> linux-rt-devel tree contains a patch that adds an extra member to struct

can you please point to the patch itself that makes that change?

> trace_entry. This causes the offset of args field in struct
> trace_event_raw_sys_enter be different from the one in struct
> syscall_trace_enter:
>
> struct trace_event_raw_sys_enter {
> struct trace_entry ent;  /* 012 */
>
> /* XXX last struct has 3 bytes of padding */
> /* XXX 4 bytes hole, try to pack */
>
> long int   id;   /*16 8 */
> long unsigned int  args[6];  /*2448 */
> /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
> char   __data[]; /*72 0 */
>
> /* size: 72, cachelines: 2, members: 4 */
> /* sum members: 68, holes: 1, sum holes: 4 */
> /* paddings: 1, sum paddings: 3 */
> /* last cacheline: 8 bytes */
> };
>
> struct syscall_trace_enter {
> struct trace_entry ent;  /* 012 */
>
> /* XXX last struct has 3 bytes of padding */
>
> intnr;   /*12 4 */
> long unsigned int  args[];   /*16 0 */
>
> /* size: 16, cachelines: 1, members: 3 */
> /* paddings: 1, sum paddings: 3 */
> /* last cacheline: 16 bytes */
> };
>
> This, in turn, causes perf_event_set_bpf_prog() fail while running bpf
> test_profiler testcase because max_ctx_offset is calculated based on the
> former struct, while off on the latter:
>
>   10488 if (is_tracepoint || is_syscall_tp) {
>   10489 int off = trace_event_get_offsets(event->tp_event);
>   10490
>   10491 if (prog->aux->max_ctx_offset > off)
>   10492 return -EACCES;
>   10493 }
>
> This patch changes the type of nr member in syscall_trace_* structs to
> be long so that "args" offset is equal to that in struct
> trace_event_raw_sys_enter.
>
> Signed-off-by: Artem Savkov 
> ---
>  kernel/trace/trace.h  | 4 ++--
>  kernel/trace/trace_syscalls.c | 7 ---
>  2 files changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 77debe53f07cf..cd1d24df85364 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -135,13 +135,13 @@ enum trace_type {
>   */
>  struct syscall_trace_enter {
> struct trace_entry  ent;
> -   int nr;
> +   longnr;
> unsigned long   args[];
>  };
>
>  struct syscall_trace_exit {
> struct trace_entry  ent;
> -   int nr;
> +   longnr;
> longret;
>  };
>
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index de753403cdafb..c26939119f2e4 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -101,7 +101,7 @@ find_syscall_meta(unsigned long syscall)
> return NULL;
>  }
>
> -static struct syscall_metadata *syscall_nr_to_meta(int nr)
> +static struct syscall_metadata *syscall_nr_to_meta(long nr)
>  {
> if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
> return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
> @@ -132,7 +132,8 @@ print_syscall_enter(struct trace_iterator *iter, int 
> flags,
> struct trace_entry *ent = iter->ent;
> struct syscall_trace_enter *trace;
> struct syscall_metadata *entry;
> -   int i, syscall;
> +   int i;
> +   long syscall;
>
> trace = (typeof(trace))ent;
> syscall = trace->nr;
> @@ -177,7 +178,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
> struct trace_seq *s = &iter->seq;
> struct trace_entry *ent = iter->ent;
> struct syscall_trace_exit *trace;
> -   int syscall;
> +   long syscall;
> struct syscall_metadata *entry;
>
> trace = (typeof(trace))ent;
> --
> 2.41.0
>
>



Re: [RFC PATCH bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

2023-10-12 Thread Andrii Nakryiko
On Thu, Oct 12, 2023 at 6:43 AM Steven Rostedt  wrote:
>
> On Thu, 12 Oct 2023 13:45:50 +0200
> Artem Savkov  wrote:
>
> > linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
> > for lazy preemption")) that adds an extra member to struct trace_entry.
> > This causes the offset of args field in struct trace_event_raw_sys_enter
> > be different from the one in struct syscall_trace_enter:
> >
> > struct trace_event_raw_sys_enter {
> > struct trace_entry ent;  /* 012 */
> >
> > /* XXX last struct has 3 bytes of padding */
> > /* XXX 4 bytes hole, try to pack */
> >
> > long int   id;   /*16 8 */
> > long unsigned int  args[6];  /*2448 */
> > /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
> > char   __data[]; /*72 0 */
> >
> > /* size: 72, cachelines: 2, members: 4 */
> > /* sum members: 68, holes: 1, sum holes: 4 */
> > /* paddings: 1, sum paddings: 3 */
> > /* last cacheline: 8 bytes */
> > };
> >
> > struct syscall_trace_enter {
> > struct trace_entry ent;  /* 012 */
> >
> > /* XXX last struct has 3 bytes of padding */
> >
> > intnr;   /*12 4 */
> > long unsigned int  args[];   /*16 0 */
> >
> > /* size: 16, cachelines: 1, members: 3 */
> > /* paddings: 1, sum paddings: 3 */
> > /* last cacheline: 16 bytes */
> > };
> >
> > This, in turn, causes perf_event_set_bpf_prog() fail while running bpf
> > test_profiler testcase because max_ctx_offset is calculated based on the
> > former struct, while off on the latter:
> >
> >   10488 if (is_tracepoint || is_syscall_tp) {
> >   10489 int off = trace_event_get_offsets(event->tp_event);
> >   10490
> >   10491 if (prog->aux->max_ctx_offset > off)
> >   10492 return -EACCES;
> >   10493 }
> >
> > What bpf program is actually getting is a pointer to struct
> > syscall_tp_t, defined in kernel/trace/trace_syscalls.c. This patch fixes
> > the problem by aligning struct syscall_tp_t with with struct
> > syscall_trace_(enter|exit) and changing the tests to use these structs
> > to dereference context.
> >
> > Signed-off-by: Artem Savkov 
>

I think these changes make sense regardless. Can you please resend the
patch without the RFC tag so that our CI can run tests for it?

> Thanks for doing a proper fix.
>
> Acked-by: Steven Rostedt (Google) 

But looking at [0] and briefly reading some of the discussions you,
Steven, had, I'm wondering if it would be best to avoid increasing
struct trace_entry altogether. It seems like preempt_count is actually
a 4-bit field in the trace context, so it doesn't seem like we really
need to allocate an entire byte each for preempt_count and
preempt_lazy_count. Why can't we just combine them into a single byte
and not waste 8 extra bytes for each trace event in the ring buffer?

  [0] 
https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/commit/?id=b1773eac3f29cbdcdfd16e0339f1a164066e9f71
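
To illustrate what I have in mind (a rough sketch, not a tested patch;
the accessor names are made up):

/* rough sketch only: pack both counters into the existing byte */
struct trace_entry {
        unsigned short type;
        unsigned char  flags;
        unsigned char  preempt_count;  /* low nibble: preempt count,
                                        * high nibble: lazy preempt count */
        int            pid;
};

#define TRACE_PREEMPT_COUNT(pc)        ((pc) & 0x0f)
#define TRACE_PREEMPT_LAZY_COUNT(pc)   (((pc) >> 4) & 0x0f)

That would keep sizeof(struct trace_entry) at 8 bytes and avoid growing
every single event in the ring buffer.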

>
> -- Steve



Re: [RFC PATCH bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

2023-10-13 Thread Andrii Nakryiko
On Fri, Oct 13, 2023 at 7:00 AM Steven Rostedt  wrote:
>
> On Fri, 13 Oct 2023 08:01:34 +0200
> Artem Savkov  wrote:
>
> > > But looking at [0] and briefly reading some of the discussions you,
> > > Steven, had. I'm just wondering if it would be best to avoid
> > > increasing struct trace_entry altogether? It seems like preempt_count
> > > is actually a 4-bit field in trace context, so it doesn't seem like we
> > > really need to allocate an entire byte for both preempt_count and
> > > preempt_lazy_count. Why can't we just combine them and not waste 8
> > > extra bytes for each trace event in a ring buffer?
> > >
> > >   [0] 
> > > https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/commit/?id=b1773eac3f29cbdcdfd16e0339f1a164066e9f71
> >
> > I agree that avoiding increase in struct trace_entry size would be very
> > desirable, but I have no knowledge whether rt developers had reasons to
> > do it like this.
> >
> > Nevertheless I think the issue with verifier running against a wrong
> > struct still needs to be addressed.
>
> Correct. My Ack is based on the current way things are done upstream.
> It was just that linux-rt showed the issue, where the code was not as
> robust as it should have been. To me this was a correctness issue, not
> an issue that had to do with how things are done in linux-rt.

I think we should at least add some BUILD_BUG_ON()s that validate that
the offsets in syscall_tp_t match the ones in syscall_trace_enter and
syscall_trace_exit, so we fail more loudly if there is any mismatch in
the future. WDYT?
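
Something along these lines (a sketch; assuming the locally-defined
syscall_tp_t keeps its current field names):

        /* sketch: field names assumed from kernel/trace/trace_syscalls.c */
        BUILD_BUG_ON(offsetof(struct syscall_tp_t, syscall_nr) !=
                     offsetof(struct syscall_trace_enter, nr));
        BUILD_BUG_ON(offsetof(struct syscall_tp_t, args) !=
                     offsetof(struct syscall_trace_enter, args));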

>
> As for the changes in linux-rt, they are not upstream yet. I'll have my
> comments on that code when that happens.

Ah, ok, cool. I'd appreciate you cc'ing b...@vger.kernel.org in that
discussion, thank you!

>
> -- Steve



Re: [PATCH for-next] tracing/kprobes: Add symbol counting check when module loads

2023-10-31 Thread Andrii Nakryiko
On Sat, Oct 28, 2023 at 8:10 PM Masami Hiramatsu (Google)
 wrote:
>
> From: Masami Hiramatsu (Google) 
>
> Check the number of probe target symbols in the target module when
> the module is loaded. If the probe is not on the unique name symbols
> in the module, it will be rejected at that point.
>
> Note that the symbol which has a unique name in the target module,
> it will be accepted even if there are same-name symbols in the
> kernel or other modules,
>
> Signed-off-by: Masami Hiramatsu (Google) 
> ---
>  kernel/trace/trace_kprobe.c |  112 
> ++-
>  1 file changed, 68 insertions(+), 44 deletions(-)
>

LGTM.

Acked-by: Andrii Nakryiko 


> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index e834f149695b..90cf2219adb4 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -670,6 +670,21 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
> return ret;
>  }
>
> +static int validate_module_probe_symbol(const char *modname, const char 
> *symbol);
> +
> +static int register_module_trace_kprobe(struct module *mod, struct 
> trace_kprobe *tk)
> +{
> +   const char *p;
> +   int ret = 0;
> +
> +   p = strchr(trace_kprobe_symbol(tk), ':');
> +   if (p)
> +   ret = validate_module_probe_symbol(module_name(mod), p++);
> +   if (!ret)
> +   ret = register_trace_kprobe(tk);
> +   return ret;
> +}
> +
>  /* Module notifier call back, checking event on the module */
>  static int trace_kprobe_module_callback(struct notifier_block *nb,
>unsigned long val, void *data)
> @@ -688,7 +703,7 @@ static int trace_kprobe_module_callback(struct 
> notifier_block *nb,
> if (trace_kprobe_within_module(tk, mod)) {
> /* Don't need to check busy - this should have gone. 
> */
> __unregister_trace_kprobe(tk);
> -   ret = __register_trace_kprobe(tk);
> +   ret = register_module_trace_kprobe(mod, tk);
> if (ret)
> pr_warn("Failed to re-register probe %s on 
> %s: %d\n",
> trace_probe_name(&tk->tp),
> @@ -729,17 +744,55 @@ static int count_mod_symbols(void *data, const char 
> *name, unsigned long unused)
> return 0;
>  }
>
> -static unsigned int number_of_same_symbols(char *func_name)
> +static unsigned int number_of_same_symbols(const char *mod, const char 
> *func_name)
>  {
> struct sym_count_ctx ctx = { .count = 0, .name = func_name };
>
> -   kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
> +   if (!mod)
> +   kallsyms_on_each_match_symbol(count_symbols, func_name, 
> &ctx.count);
>
> -   module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
> +   module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
>
> return ctx.count;
>  }
>
> +static int validate_module_probe_symbol(const char *modname, const char 
> *symbol)
> +{
> +   unsigned int count = number_of_same_symbols(modname, symbol);
> +
> +   if (count > 1) {
> +   /*
> +* Users should use ADDR to remove the ambiguity of
> +* using KSYM only.
> +*/
> +   return -EADDRNOTAVAIL;
> +   } else if (count == 0) {
> +   /*
> +* We can return ENOENT earlier than when register the
> +* kprobe.
> +*/
> +   return -ENOENT;
> +   }
> +   return 0;
> +}
> +
> +static int validate_probe_symbol(char *symbol)
> +{
> +   char *mod = NULL, *p;
> +   int ret;
> +
> +   p = strchr(symbol, ':');
> +   if (p) {
> +   mod = symbol;
> +   symbol = p + 1;
> +   *p = '\0';
> +   }
> +   ret = validate_module_probe_symbol(mod, symbol);
> +   if (p)
> +   *p = ':';
> +   return ret;
> +}
> +
>  static int __trace_kprobe_create(int argc, const char *argv[])
>  {
> /*
> @@ -859,6 +912,14 @@ static int __trace_kprobe_create(int argc, const char 
> *argv[])
> trace_probe_log_err(0, BAD_PROBE_ADDR);
> goto parse_error;
> }
> +   ret = validate_probe_symbol(symbol);
> +   if (ret) {
> +   if (ret == -EADDRNOTAVAIL)

Re: [PATCHv7 bpf-next 0/9] uprobe: uretprobe speed up

2024-05-31 Thread Andrii Nakryiko
On Thu, May 23, 2024 at 5:11 AM Jiri Olsa  wrote:
>
> hi,
> as part of the effort on speeding up the uprobes [0] coming with
> return uprobe optimization by using syscall instead of the trap
> on the uretprobe trampoline.
>
> The speed up depends on instruction type that uprobe is installed
> and depends on specific HW type, please check patch 1 for details.
>
> Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
> apply-able on linux-trace.git tree probes/for-next branch.
> Patch 9 is based on man-pages master.
>
> v7 changes:
> - fixes in man page [Alejandro Colomar]
> - fixed patch #1 fixes tag [Oleg]
>
> Also available at:
>   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
>   uretprobe_syscall
>
> thanks,
> jirka
>
>
> Notes to check list items in Documentation/process/adding-syscalls.rst:
>
> - System Call Alternatives
>   New syscall seems like the best way in here, because we need
>   just to quickly enter kernel with no extra arguments processing,
>   which we'd need to do if we decided to use another syscall.
>
> - Designing the API: Planning for Extension
>   The uretprobe syscall is very specific and most likely won't be
>   extended in the future.
>
>   At the moment it does not take any arguments and even if it does
>   in future, it's allowed to be called only from trampoline prepared
>   by kernel, so there'll be no broken user.
>
> - Designing the API: Other Considerations
>   N/A because uretprobe syscall does not return reference to kernel
>   object.
>
> - Proposing the API
>   Wiring up of the uretprobe system call is in separate change,
>   selftests and man page changes are part of the patchset.
>
> - Generic System Call Implementation
>   There's no CONFIG option for the new functionality because it
>   keeps the same behaviour from the user POV.
>
> - x86 System Call Implementation
>   It's 64-bit syscall only.
>
> - Compatibility System Calls (Generic)
>   N/A uretprobe syscall has no arguments and is not supported
>   for compat processes.
>
> - Compatibility System Calls (x86)
>   N/A uretprobe syscall is not supported for compat processes.
>
> - System Calls Returning Elsewhere
>   N/A.
>
> - Other Details
>   N/A.
>
> - Testing
>   Adding new bpf selftests and ran ltp on top of this change.
>
> - Man Page
>   Attached.
>
> - Do not call System Calls in the Kernel
>   N/A.
>
>
> [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
> ---
> Jiri Olsa (8):
>   x86/shstk: Make return uprobe work with shadow stack
>   uprobe: Wire up uretprobe system call
>   uprobe: Add uretprobe syscall to speed up return probe
>   selftests/x86: Add return uprobe shadow stack test
>   selftests/bpf: Add uretprobe syscall test for regs integrity
>   selftests/bpf: Add uretprobe syscall test for regs changes
>   selftests/bpf: Add uretprobe syscall call from user space test
>   selftests/bpf: Add uretprobe shadow stack test
>

Masami, Steven,

It seems like the series is ready to go in. Are you planning to take
the first 4 patches through your linux-trace tree?

>  arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
>  arch/x86/include/asm/shstk.h|   4 +
>  arch/x86/kernel/shstk.c |  16 
>  arch/x86/kernel/uprobes.c   | 124 
> -
>  include/linux/syscalls.h|   2 +
>  include/linux/uprobes.h |   3 +
>  include/uapi/asm-generic/unistd.h   |   5 +-
>  kernel/events/uprobes.c |  24 --
>  kernel/sys_ni.c |   2 +
>  tools/include/linux/compiler.h  |   4 +
>  tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 
> -
>  tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 385 
> +++
>  tools/testing/selftests/bpf/progs/uprobe_syscall.c  |  15 
>  tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c |  17 
>  tools/testing/selftests/x86/test_shadow_stack.c | 145 
> ++
>  15 files changed, 860 insertions(+), 10 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
>  create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall.c
>  create mode 100644 
> tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
>
> Jiri Olsa (1):
>   man2: Add uretprobe syscall page
>
>  man/man2/uretprobe.2 | 56 
> 
>  1 file changed, 56 insertions(+)
>  create mode 100644 man/man2/uretprobe.2



Re: [PATCHv7 bpf-next 0/9] uprobe: uretprobe speed up

2024-06-05 Thread Andrii Nakryiko
On Fri, May 31, 2024 at 10:52 AM Andrii Nakryiko
 wrote:
>
> On Thu, May 23, 2024 at 5:11 AM Jiri Olsa  wrote:
> >
> > hi,
> > as part of the effort on speeding up the uprobes [0] coming with
> > return uprobe optimization by using syscall instead of the trap
> > on the uretprobe trampoline.
> >
> > The speed up depends on instruction type that uprobe is installed
> > and depends on specific HW type, please check patch 1 for details.
> >
> > Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
> > apply-able on linux-trace.git tree probes/for-next branch.
> > Patch 9 is based on man-pages master.
> >
> > v7 changes:
> > - fixes in man page [Alejandro Colomar]
> > - fixed patch #1 fixes tag [Oleg]
> >
> > Also available at:
> >   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> >   uretprobe_syscall
> >
> > thanks,
> > jirka
> >
> >
> > Notes to check list items in Documentation/process/adding-syscalls.rst:
> >
> > - System Call Alternatives
> >   New syscall seems like the best way in here, because we need
> >   just to quickly enter kernel with no extra arguments processing,
> >   which we'd need to do if we decided to use another syscall.
> >
> > - Designing the API: Planning for Extension
> >   The uretprobe syscall is very specific and most likely won't be
> >   extended in the future.
> >
> >   At the moment it does not take any arguments and even if it does
> >   in future, it's allowed to be called only from trampoline prepared
> >   by kernel, so there'll be no broken user.
> >
> > - Designing the API: Other Considerations
> >   N/A because uretprobe syscall does not return reference to kernel
> >   object.
> >
> > - Proposing the API
> >   Wiring up of the uretprobe system call is in separate change,
> >   selftests and man page changes are part of the patchset.
> >
> > - Generic System Call Implementation
> >   There's no CONFIG option for the new functionality because it
> >   keeps the same behaviour from the user POV.
> >
> > - x86 System Call Implementation
> >   It's 64-bit syscall only.
> >
> > - Compatibility System Calls (Generic)
> >   N/A uretprobe syscall has no arguments and is not supported
> >   for compat processes.
> >
> > - Compatibility System Calls (x86)
> >   N/A uretprobe syscall is not supported for compat processes.
> >
> > - System Calls Returning Elsewhere
> >   N/A.
> >
> > - Other Details
> >   N/A.
> >
> > - Testing
> >   Adding new bpf selftests and ran ltp on top of this change.
> >
> > - Man Page
> >   Attached.
> >
> > - Do not call System Calls in the Kernel
> >   N/A.
> >
> >
> > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
> > ---
> > Jiri Olsa (8):
> >   x86/shstk: Make return uprobe work with shadow stack
> >   uprobe: Wire up uretprobe system call
> >   uprobe: Add uretprobe syscall to speed up return probe
> >   selftests/x86: Add return uprobe shadow stack test
> >   selftests/bpf: Add uretprobe syscall test for regs integrity
> >   selftests/bpf: Add uretprobe syscall test for regs changes
> >   selftests/bpf: Add uretprobe syscall call from user space test
> >   selftests/bpf: Add uretprobe shadow stack test
> >
>
> Masami, Steven,
>
> It seems like the series is ready to go in. Are you planning to take
> the first 4 patches through your linux-trace tree?

Another ping. It's been two weeks since Jiri posted the last revision;
there is no outstanding feedback left to address and everyone seems to
be happy with it.

This is an important speed-up for the uprobe infrastructure in general
and for the BPF ecosystem in particular. "Uprobes are slow" is one of
the top complaints from production BPF users, and the sys_uretprobe
approach significantly improves the situation for return uprobes (aka
uretprobes), potentially enabling new use cases that previously were too
expensive to trace in practice and reducing the overhead of existing
ones.

I'd appreciate engagement from the linux-trace maintainers on this
patch set. Given that it's important for BPF and that a big part of the
patch set is BPF-based selftests, we'd also be happy to route all of
this through the bpf-next tree (which would actually make logistics much
easier for us, but that's not the main concern). Regardless of the tree,
though, it would be nice to make a decision and move forward with it.

Thank 

Re: [RFC bpf-next 01/10] uprobe: Add session callbacks to uprobe_consumer

2024-06-05 Thread Andrii Nakryiko
On Tue, Jun 4, 2024 at 1:02 PM Jiri Olsa  wrote:
>
> Adding new set of callbacks that are triggered on entry and return
> uprobe execution for the attached function.
>
> The session means that those callbacks are 'connected' in a way
> that allows to:
>   - control execution of return callback from entry callback
>   - share data between entry and return callbacks
>
> The session concept fits to our common use case where we do filtering
> on entry uprobe and based on the result we decide to run the return
> uprobe (or not).
>
> It's also convenient to share the data between session callbacks.
>
> The control of return uprobe execution is done via return value of the
> entry session callback, where 0 means to install and execute return
> uprobe, 1 means to not install.
>
> Current implementation has a restriction that allows to register only
> single consumer with session callbacks for a uprobe and also restricting
> standard callbacks consumers.
>
> Which means that there can be only single user of a uprobe (inode +
> offset) when session consumer is registered to it.
>
> This is because all registered consumers are executed when uprobe or
> return uprobe is hit and without additional layer (like fgraph's shadow
> stack) that would keep the state of the return callback, we have no
> way to find out which consumer should be executed.
>
> I'm not sure how big limitation this is for people, our current use
> case seems to be ok with that. Fixing this would be more complex/bigger
> change to uprobes, thoughts?

I think it's a pretty big limitation, because in production you don't
always know ahead of time all possible users of uprobe, so any such
limitations will cause problems, issue reports, investigation, etc.

As one possible solution, what if we do

struct return_instance {
...
u64 session_cookies[];
};

and allocate sizeof(struct return_instance) + 8 * (number of session
consumers), and then at runtime pass &session_cookies[i] as the data
pointer to session-aware callbacks?
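
To spell out the idea (a rough sketch only; sessions_cnt/session_cookies
are made-up names, and the three-argument handler is the proposed
extension, not the current API):

    struct return_instance {
            struct uprobe           *uprobe;
            unsigned long           func;
            unsigned long           stack;
            unsigned long           orig_ret_vaddr;
            bool                    chained;
            struct return_instance  *next;
            int                     sessions_cnt;      /* hypothetical */
            u64                     session_cookies[]; /* hypothetical */
    };

    /* one cookie slot per session-aware consumer */
    ri = kzalloc(struct_size(ri, session_cookies, sessions_cnt), GFP_KERNEL);

    /* at (ret)probe time, the i-th session-aware consumer gets its own slot */
    uc->handler(uc, regs, &ri->session_cookies[i]);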

>
> Hence sending this as RFC to gather more opinions and feedback.
>
> Signed-off-by: Jiri Olsa 
> ---
>  include/linux/uprobes.h | 18 +++
>  kernel/events/uprobes.c | 69 +++--
>  2 files changed, 78 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..a2f2d5ac3cee 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -34,6 +34,12 @@ enum uprobe_filter_ctx {
>  };
>
>  struct uprobe_consumer {
> +   /*
> +* The handler callback return value controls removal of the uprobe.
> +*  0 on success, uprobe stays
> +*  1 on failure, remove the uprobe
> +*console warning for anything else
> +*/
> int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> int (*ret_handler)(struct uprobe_consumer *self,
> unsigned long func,
> @@ -42,6 +48,17 @@ struct uprobe_consumer {
> enum uprobe_filter_ctx ctx,
> struct mm_struct *mm);
>
> +   /* The handler_session callback return value controls execution of
> +* the return uprobe and ret_handler_session callback.
> +*  0 on success
> +*  1 on failure, DO NOT install/execute the return uprobe
> +*console warning for anything else
> +*/
> +   int (*handler_session)(struct uprobe_consumer *self, struct pt_regs 
> *regs,
> +  unsigned long *data);
> +   int (*ret_handler_session)(struct uprobe_consumer *self, unsigned 
> long func,
> +  struct pt_regs *regs, unsigned long *data);
> +

We should try to avoid an alternative set of callbacks, IMO. Let's
extend existing ones with `unsigned long *data`, but specify that
unless consumer sets some flag on registration that it needs a session
cookie, we'll pass NULL here? Or just allocate cookie data for each
registered consumer for simplicity, don't know; given we don't expect
many consumers on exactly the same uprobe, it might be ok to keep it
simple.
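
Concretely, the extended signatures could look roughly like this (a
sketch of the suggestion, not final code; non-session consumers would
simply get data == NULL):

    struct uprobe_consumer {
            int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs,
                           unsigned long *data);
            int (*ret_handler)(struct uprobe_consumer *self, unsigned long func,
                               struct pt_regs *regs, unsigned long *data);
            bool (*filter)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx,
                           struct mm_struct *mm);

            struct uprobe_consumer *next;
    };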


> struct uprobe_consumer *next;
>  };
>
> @@ -85,6 +102,7 @@ struct return_instance {
> unsigned long   func;
> unsigned long   stack;  /* stack pointer */
> unsigned long   orig_ret_vaddr; /* original return address */
> +   unsigned long   data;
> boolchained;/* true, if instance is 
> nested */
>
> struct return_instance  *next;  /* keep as stack */
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..17b0771272a6 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -750,12 +750,32 @@ static struct uprobe *alloc_uprobe(struct inode *inode, 
> loff_t offset,
> return uprobe;
>  }
>
> -static void consumer_add(struct uprobe *uprobe, struct

Re: [RFC bpf-next 01/10] uprobe: Add session callbacks to uprobe_consumer

2024-06-05 Thread Andrii Nakryiko
On Wed, Jun 5, 2024 at 10:57 AM Oleg Nesterov  wrote:
>
> On 06/05, Andrii Nakryiko wrote:
> >
> > so any such
> > limitations will cause problems, issue reports, investigation, etc.
>
> Agreed...
>
> > As one possible solution, what if we do
> >
> > struct return_instance {
> > ...
> > u64 session_cookies[];
> > };
> >
> > and allocate sizeof(struct return_instance) + 8 *
> >  and then at runtime pass
> > &session_cookies[i] as data pointer to session-aware callbacks?
>
> I too thought about this, but I guess it is not that simple.
>
> Just for example. Suppose we have 2 session-consumers C1 and C2.
> What if uprobe_unregister(C1) comes before the probed function
> returns?
>
> We need something like map_cookie_to_consumer().

Fair enough. The easy way to solve this is to have


struct uprobe_session_cookie {
int consumer_id;
u64 cookie;
};

And assign an ID to each new consumer when it is added to struct uprobe.
Unfortunately, it's impossible to tell whether a new consumer was added
to the list (as a front item, though maybe we should just change it to
append instead of prepend) or an old consumer was removed, so in some
cases we'd need to do a linear search.

But the good news is that in the common case we wouldn't need to
search and the next item in session_cookies[] array would be the one
we need.
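
To make the matching concrete, something along these lines (illustrative
only; ri->ncookies, ri->cookies and uc->id are made-up names, and
ret_handler takes the proposed extra data argument):

    struct uprobe_session_cookie {
            int consumer_id;
            u64 cookie;
    };

    int i = 0, j;

    for (uc = uprobe->consumers; uc; uc = uc->next) {
            if (!uc->ret_handler)
                    continue;
            /* common case: cookies recorded in the same order consumers run */
            if (i < ri->ncookies && ri->cookies[i].consumer_id == uc->id) {
                    uc->ret_handler(uc, ri->func, regs, &ri->cookies[i].cookie);
                    i++;
                    continue;
            }
            /* consumers changed since function entry: fall back to linear search */
            for (j = 0; j < ri->ncookies; j++) {
                    if (ri->cookies[j].consumer_id == uc->id) {
                            uc->ret_handler(uc, ri->func, regs, &ri->cookies[j].cookie);
                            break;
                    }
            }
    }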

WDYT? It's still fast, and it's simpler than the shadow stack idea, IMO.

P.S. Regardless, maybe we should change the order in which we insert
consumers into the uprobe? Right now a uprobe consumer added later is
executed first, which, while not wrong, is counter-intuitive. It also
breaks the natural ordering we'd want when matching consumers up with
something like session_cookies[] as described above.

>
> > > +   /* The handler_session callback return value controls execution of
> > > +* the return uprobe and ret_handler_session callback.
> > > +*  0 on success
> > > +*  1 on failure, DO NOT install/execute the return uprobe
> > > +*console warning for anything else
> > > +*/
> > > +   int (*handler_session)(struct uprobe_consumer *self, struct 
> > > pt_regs *regs,
> > > +  unsigned long *data);
> > > +   int (*ret_handler_session)(struct uprobe_consumer *self, unsigned 
> > > long func,
> > > +  struct pt_regs *regs, unsigned long 
> > > *data);
> > > +
> >
> > We should try to avoid an alternative set of callbacks, IMO. Let's
> > extend existing ones with `unsigned long *data`,
>
> Oh yes, agreed.
>
> And the comment about the return value looks confusing too. I mean, the
> logic doesn't differ from the ret-code from ->handler().
>
> "DO NOT install/execute the return uprobe" is not true if another
> non-session-consumer returns 0.
>
> Oleg.
>



Re: [RFC bpf-next 01/10] uprobe: Add session callbacks to uprobe_consumer

2024-06-06 Thread Andrii Nakryiko
On Thu, Jun 6, 2024 at 9:46 AM Jiri Olsa  wrote:
>
> On Wed, Jun 05, 2024 at 10:50:11PM +0200, Jiri Olsa wrote:
> > On Wed, Jun 05, 2024 at 07:56:19PM +0200, Oleg Nesterov wrote:
> > > On 06/05, Andrii Nakryiko wrote:
> > > >
> > > > so any such
> > > > limitations will cause problems, issue reports, investigation, etc.
> > >
> > > Agreed...
> > >
> > > > As one possible solution, what if we do
> > > >
> > > > struct return_instance {
> > > > ...
> > > > u64 session_cookies[];
> > > > };
> > > >
> > > > and allocate sizeof(struct return_instance) + 8 *
> > > >  and then at runtime pass
> > > > &session_cookies[i] as data pointer to session-aware callbacks?
> > >
> > > I too thought about this, but I guess it is not that simple.
> > >
> > > Just for example. Suppose we have 2 session-consumers C1 and C2.
> > > What if uprobe_unregister(C1) comes before the probed function
> > > returns?
> > >
> > > We need something like map_cookie_to_consumer().
> >
> > I guess we could have hash table in return_instance that gets 'consumer -> 
> > cookie' ?
>
> ok, hash table is probably too big for this.. I guess some solution that
> would iterate consumers and cookies made sure it matches would be fine
>

Yes, I was hoping to avoid hash tables for this, and in the common
case have no added overhead.

> jirka
>
> >
> > return instance is freed after the consumers' return handlers are executed,
> > so there's no leak if some consumer gets unregistered before that
> >
> > >
> > > > > +   /* The handler_session callback return value controls 
> > > > > execution of
> > > > > +* the return uprobe and ret_handler_session callback.
> > > > > +*  0 on success
> > > > > +*  1 on failure, DO NOT install/execute the return uprobe
> > > > > +*console warning for anything else
> > > > > +*/
> > > > > +   int (*handler_session)(struct uprobe_consumer *self, struct 
> > > > > pt_regs *regs,
> > > > > +  unsigned long *data);
> > > > > +   int (*ret_handler_session)(struct uprobe_consumer *self, 
> > > > > unsigned long func,
> > > > > +  struct pt_regs *regs, unsigned 
> > > > > long *data);
> > > > > +
> > > >
> > > > We should try to avoid an alternative set of callbacks, IMO. Let's
> > > > extend existing ones with `unsigned long *data`,
> > >
> > > Oh yes, agreed.
> > >
> > > And the comment about the return value looks confusing too. I mean, the
> > > logic doesn't differ from the ret-code from ->handler().
> > >
> > > "DO NOT install/execute the return uprobe" is not true if another
> > > non-session-consumer returns 0.
> >
> > well they are meant to be exclusive, so there'd be no other 
> > non-session-consumer
> >
> > jirka



Re: [PATCHv7 bpf-next 0/9] uprobe: uretprobe speed up

2024-06-11 Thread Andrii Nakryiko
On Mon, Jun 10, 2024 at 10:46 PM Masami Hiramatsu  wrote:
>
> On Wed, 5 Jun 2024 09:42:45 -0700
> Andrii Nakryiko  wrote:
>
> > On Fri, May 31, 2024 at 10:52 AM Andrii Nakryiko
> >  wrote:
> > >
> > > On Thu, May 23, 2024 at 5:11 AM Jiri Olsa  wrote:
> > > >
> > > > hi,
> > > > as part of the effort on speeding up the uprobes [0] coming with
> > > > return uprobe optimization by using syscall instead of the trap
> > > > on the uretprobe trampoline.
> > > >
> > > > The speed up depends on instruction type that uprobe is installed
> > > > and depends on specific HW type, please check patch 1 for details.
> > > >
> > > > Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
> > > > apply-able on linux-trace.git tree probes/for-next branch.
> > > > Patch 9 is based on man-pages master.
> > > >
> > > > v7 changes:
> > > > - fixes in man page [Alejandro Colomar]
> > > > - fixed patch #1 fixes tag [Oleg]
> > > >
> > > > Also available at:
> > > >   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> > > >   uretprobe_syscall
> > > >
> > > > thanks,
> > > > jirka
> > > >
> > > >
> > > > Notes to check list items in Documentation/process/adding-syscalls.rst:
> > > >
> > > > - System Call Alternatives
> > > >   New syscall seems like the best way in here, because we need
> > > >   just to quickly enter kernel with no extra arguments processing,
> > > >   which we'd need to do if we decided to use another syscall.
> > > >
> > > > - Designing the API: Planning for Extension
> > > >   The uretprobe syscall is very specific and most likely won't be
> > > >   extended in the future.
> > > >
> > > >   At the moment it does not take any arguments and even if it does
> > > >   in future, it's allowed to be called only from trampoline prepared
> > > >   by kernel, so there'll be no broken user.
> > > >
> > > > - Designing the API: Other Considerations
> > > >   N/A because uretprobe syscall does not return reference to kernel
> > > >   object.
> > > >
> > > > - Proposing the API
> > > >   Wiring up of the uretprobe system call is in separate change,
> > > >   selftests and man page changes are part of the patchset.
> > > >
> > > > - Generic System Call Implementation
> > > >   There's no CONFIG option for the new functionality because it
> > > >   keeps the same behaviour from the user POV.
> > > >
> > > > - x86 System Call Implementation
> > > >   It's 64-bit syscall only.
> > > >
> > > > - Compatibility System Calls (Generic)
> > > >   N/A uretprobe syscall has no arguments and is not supported
> > > >   for compat processes.
> > > >
> > > > - Compatibility System Calls (x86)
> > > >   N/A uretprobe syscall is not supported for compat processes.
> > > >
> > > > - System Calls Returning Elsewhere
> > > >   N/A.
> > > >
> > > > - Other Details
> > > >   N/A.
> > > >
> > > > - Testing
> > > >   Adding new bpf selftests and ran ltp on top of this change.
> > > >
> > > > - Man Page
> > > >   Attached.
> > > >
> > > > - Do not call System Calls in the Kernel
> > > >   N/A.
> > > >
> > > >
> > > > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
> > > > ---
> > > > Jiri Olsa (8):
> > > >   x86/shstk: Make return uprobe work with shadow stack
> > > >   uprobe: Wire up uretprobe system call
> > > >   uprobe: Add uretprobe syscall to speed up return probe
> > > >   selftests/x86: Add return uprobe shadow stack test
> > > >   selftests/bpf: Add uretprobe syscall test for regs integrity
> > > >   selftests/bpf: Add uretprobe syscall test for regs changes
> > > >   selftests/bpf: Add uretprobe syscall call from user space test
> > > >   selftests/bpf: Add uretprobe shadow stack test
> > > >
> > >
> > > Masami, Steven,
> > >
> > > It seems like the series is ready to go in. Are you planning to take
> > 

Re: [PATCHv8 bpf-next 0/9] uprobe: uretprobe speed up

2024-06-12 Thread Andrii Nakryiko
On Tue, Jun 11, 2024 at 3:52 PM Masami Hiramatsu  wrote:
>
> On Tue, 11 Jun 2024 13:21:49 +0200
> Jiri Olsa  wrote:
>
> > hi,
> > as part of the effort on speeding up the uprobes [0] coming with
> > return uprobe optimization by using syscall instead of the trap
> > on the uretprobe trampoline.
> >
> > The speed up depends on instruction type that uprobe is installed
> > and depends on specific HW type, please check patch 1 for details.
> >
> > Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
> > apply-able on linux-trace.git tree probes/for-next branch.
> > Patch 9 is based on man-pages master.
> >
> > v8 changes:
> > - rebased (another new syscall got merged)
> > - added acks
> >
> > Also available at:
> >   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> >   uretprobe_syscall
>
> Applied patch [1/9] - [8/9] on probes/for-next in
>  git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
>

Thanks, Masami, looks good! I also confirmed that probes/for-next and
bpf-next/master can be merged with no conflicts (hopefully it stays
this way till the next merge window).

> Thank you!
>
> >
> > thanks,
> > jirka
> >
> >
> > Notes to check list items in Documentation/process/adding-syscalls.rst:
> >
> > - System Call Alternatives
> >   New syscall seems like the best way in here, because we need
> >   just to quickly enter kernel with no extra arguments processing,
> >   which we'd need to do if we decided to use another syscall.
> >
> > - Designing the API: Planning for Extension
> >   The uretprobe syscall is very specific and most likely won't be
> >   extended in the future.
> >
> >   At the moment it does not take any arguments and even if it does
> >   in future, it's allowed to be called only from trampoline prepared
> >   by kernel, so there'll be no broken user.
> >
> > - Designing the API: Other Considerations
> >   N/A because uretprobe syscall does not return reference to kernel
> >   object.
> >
> > - Proposing the API
> >   Wiring up of the uretprobe system call is in separate change,
> >   selftests and man page changes are part of the patchset.
> >
> > - Generic System Call Implementation
> >   There's no CONFIG option for the new functionality because it
> >   keeps the same behaviour from the user POV.
> >
> > - x86 System Call Implementation
> >   It's 64-bit syscall only.
> >
> > - Compatibility System Calls (Generic)
> >   N/A uretprobe syscall has no arguments and is not supported
> >   for compat processes.
> >
> > - Compatibility System Calls (x86)
> >   N/A uretprobe syscall is not supported for compat processes.
> >
> > - System Calls Returning Elsewhere
> >   N/A.
> >
> > - Other Details
> >   N/A.
> >
> > - Testing
> >   Adding new bpf selftests and ran ltp on top of this change.
> >
> > - Man Page
> >   Attached.
> >
> > - Do not call System Calls in the Kernel
> >   N/A.
> >
> >
> > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
> > ---
> > Jiri Olsa (8):
> >   x86/shstk: Make return uprobe work with shadow stack
> >   uprobe: Wire up uretprobe system call
> >   uprobe: Add uretprobe syscall to speed up return probe
> >   selftests/x86: Add return uprobe shadow stack test
> >   selftests/bpf: Add uretprobe syscall test for regs integrity
> >   selftests/bpf: Add uretprobe syscall test for regs changes
> >   selftests/bpf: Add uretprobe syscall call from user space test
> >   selftests/bpf: Add uretprobe shadow stack test
> >
> >  arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
> >  arch/x86/include/asm/shstk.h|   4 +
> >  arch/x86/kernel/shstk.c |  16 
> >  arch/x86/kernel/uprobes.c   | 124 
> > -
> >  include/linux/syscalls.h|   2 +
> >  include/linux/uprobes.h |   3 +
> >  include/uapi/asm-generic/unistd.h   |   5 +-
> >  kernel/events/uprobes.c |  24 --
> >  kernel/sys_ni.c |   2 +
> >  tools/include/linux/compiler.h  |   4 +
> >  tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 
> > -
> >  tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 385 
> > +++
> >  tools/testing/selftests/bpf/progs/uprobe_syscall.c  |  15 
> >  tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c |  17 
> >  tools/testing/selftests/x86/test_shadow_stack.c | 145 
> > ++
> >  15 files changed, 860 insertions(+), 10 deletions(-)
> >  create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> >  create mode 100644 tools/testing/selftests/bpf/progs

Re: [RFC bpf-next 01/10] uprobe: Add session callbacks to uprobe_consumer

2024-06-17 Thread Andrii Nakryiko
On Mon, Jun 10, 2024 at 4:06 AM Jiri Olsa  wrote:
>
> On Thu, Jun 06, 2024 at 09:52:39AM -0700, Andrii Nakryiko wrote:
> > On Thu, Jun 6, 2024 at 9:46 AM Jiri Olsa  wrote:
> > >
> > > On Wed, Jun 05, 2024 at 10:50:11PM +0200, Jiri Olsa wrote:
> > > > On Wed, Jun 05, 2024 at 07:56:19PM +0200, Oleg Nesterov wrote:
> > > > > On 06/05, Andrii Nakryiko wrote:
> > > > > >
> > > > > > so any such
> > > > > > limitations will cause problems, issue reports, investigation, etc.
> > > > >
> > > > > Agreed...
> > > > >
> > > > > > As one possible solution, what if we do
> > > > > >
> > > > > > struct return_instance {
> > > > > > ...
> > > > > > u64 session_cookies[];
> > > > > > };
> > > > > >
> > > > > > and allocate sizeof(struct return_instance) + 8 *
> > > > > >  and then at runtime pass
> > > > > > &session_cookies[i] as data pointer to session-aware callbacks?
> > > > >
> > > > > I too thought about this, but I guess it is not that simple.
> > > > >
> > > > > Just for example. Suppose we have 2 session-consumers C1 and C2.
> > > > > What if uprobe_unregister(C1) comes before the probed function
> > > > > returns?
> > > > >
> > > > > We need something like map_cookie_to_consumer().
> > > >
> > > > I guess we could have hash table in return_instance that gets 'consumer 
> > > > -> cookie' ?
> > >
> > > ok, hash table is probably too big for this.. I guess some solution that
> > > would iterate consumers and cookies made sure it matches would be fine
> > >
> >
> > Yes, I was hoping to avoid hash tables for this, and in the common
> > case have no added overhead.
>
> hi,
> here's first stab on that.. the change below:
>   - extends current handlers with extra argument rather than adding new
> set of handlers
>   - store session consumers objects within return_instance object and
>   - iterate these objects ^^^ in handle_uretprobe_chain
>
> I guess it could be still polished, but I wonder if this could
> be the right direction to do this.. thoughts? ;-)

Yeah, I think this is the right direction. It's a bit sad that this
makes getting rid of the rw_sem on the hot path even harder, but that's
a separate problem.

>
> thanks,
> jirka
>
>
> ---
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..4e40e8352eac 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -34,15 +34,19 @@ enum uprobe_filter_ctx {
>  };
>
>  struct uprobe_consumer {
> -   int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> +   int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs,
> +   unsigned long *data);

Can we use __u64 here? This long vs __u64 mismatch might cause problems
for BPF when the host is a 32-bit architecture (BPF is always 64-bit).

> int (*ret_handler)(struct uprobe_consumer *self,
> unsigned long func,
> -   struct pt_regs *regs);
> +   struct pt_regs *regs,
> +   unsigned long *data);
> bool (*filter)(struct uprobe_consumer *self,
> enum uprobe_filter_ctx ctx,
> struct mm_struct *mm);
>

[...]

>  static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
>  {
> struct uprobe_task *n_utask;
> @@ -1756,11 +1795,11 @@ static int dup_utask(struct task_struct *t, struct 
> uprobe_task *o_utask)
>
> p = &n_utask->return_instances;
> for (o = o_utask->return_instances; o; o = o->next) {
> -   n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
> +   n = alloc_return_instance(o->session_cnt);
> if (!n)
> return -ENOMEM;
>
> -   *n = *o;
> +   memcpy(n, o, ri_size(o->session_cnt));
> get_uprobe(n->uprobe);
> n->next = NULL;
>
> @@ -1853,35 +1892,38 @@ static void cleanup_return_instances(struct 
> uprobe_task *utask, bool chained,
> utask->return_instances = ri;
>  }
>
> -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
> +static struct return_instance *
> +prepare_

Re: [PATCH] uprobe: Do not use UPROBE_SWBP_INSN as static initializer

2024-06-20 Thread Andrii Nakryiko
On Tue, Jun 18, 2024 at 12:43 PM Jiri Olsa  wrote:
>
> Nathan reported compilation fail for loongarch arch:
>
>   kernel/events/uprobes.c: In function 'arch_uprobe_trampoline':
>   arch/loongarch/include/asm/uprobes.h:12:33: error: initializer element is 
> not constant
>  12 | #define UPROBE_SWBP_INSNlarch_insn_gen_break(BRK_UPROBE_BP)
> | ^~~~
>   kernel/events/uprobes.c:1479:39: note: in expansion of macro 
> 'UPROBE_SWBP_INSN'
>1479 | static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
>
> Loongarch defines UPROBE_SWBP_INSN as function call, so we can't
> use it to initialize static variable.
>
> Cc: Oleg Nesterov 
> Fixes: ff474a78cef5 ("uprobe: Add uretprobe syscall to speed up return probe")
> Reported-by: Nathan Chancellor 
> Signed-off-by: Jiri Olsa 
> ---
>  kernel/events/uprobes.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>

Can we instead ask the loongarch folks to rewrite it to be a constant?
Having this as a function call is both an inconvenience and a potential
performance problem (a minor one, but still). I would imagine it's not
hard to hard-code the instruction as a constant here.

> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2816e65729ac..6986bd993702 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1476,8 +1476,9 @@ static int xol_add_vma(struct mm_struct *mm, struct 
> xol_area *area)
>
>  void * __weak arch_uprobe_trampoline(unsigned long *psize)
>  {
> -   static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
> +   static uprobe_opcode_t insn;
>
> +   insn = insn ?: UPROBE_SWBP_INSN;
> *psize = UPROBE_SWBP_INSN_SIZE;
> return &insn;
>  }
> --
> 2.45.1
>



Re: [PATCH] uprobe: Do not use UPROBE_SWBP_INSN as static initializer

2024-06-20 Thread Andrii Nakryiko
On Thu, Jun 20, 2024 at 12:40 PM Oleg Nesterov  wrote:
>
> On 06/20, Andrii Nakryiko wrote:
> >
> > Can we instead ask loongarch folks to rewrite it to be a constant?
> > Having this as a function call is both an inconvenience and potential
> > performance problem (a minor one, but still). I would imagine it's not
> > hard to hard-code an instruction as a constant here.
>
> I was going to ask the same question when I saw the bug report ;)
> The same for other users of larch_insn_gen_break().
>
> But I can't understand what does it do, it calls emit_break() and
> git grep -w emit_break finds nothing.
>

It's DEF_EMIT_REG0I15_FORMAT(break, break_op) in
arch/loongarch/include/asm/inst.h

A bunch of macro magic, but in the end it produces some constant
value, of course.

> Oleg.
>



Re: [PATCH] LoongArch: uprobes: make UPROBE_SWBP_INSN/UPROBE_XOLBP_INSN constant

2024-06-27 Thread Andrii Nakryiko
On Thu, Jun 27, 2024 at 9:04 AM Oleg Nesterov  wrote:
>
> LoongArch defines UPROBE_SWBP_INSN as a function call and this breaks
> arch_uprobe_trampoline() which uses it to initialize a static variable.
>
> Fixes: ff474a78cef5 ("uprobe: Add uretprobe syscall to speed up return probe")
> Reported-by: Nathan Chancellor 
> Closes: https://lore.kernel.org/all/20240614174822.GA1185149@thelio-3990X/
> Suggested-by: Andrii Nakryiko 
> Signed-off-by: Oleg Nesterov 
> ---
>  arch/loongarch/include/asm/uprobes.h | 6 --
>  arch/loongarch/kernel/uprobes.c  | 8 
>  2 files changed, 12 insertions(+), 2 deletions(-)
>

LGTM.

Acked-by: Andrii Nakryiko 


> diff --git a/arch/loongarch/include/asm/uprobes.h 
> b/arch/loongarch/include/asm/uprobes.h
> index c8f59983f702..18221eb9a8b0 100644
> --- a/arch/loongarch/include/asm/uprobes.h
> +++ b/arch/loongarch/include/asm/uprobes.h
> @@ -6,13 +6,15 @@
>
>  typedef u32 uprobe_opcode_t;
>
> +#define __emit_break(imm)  (uprobe_opcode_t)((imm) | (break_op << 15))
> +
>  #define MAX_UINSN_BYTES8
>  #define UPROBE_XOL_SLOT_BYTES  MAX_UINSN_BYTES
>
> -#define UPROBE_SWBP_INSN   larch_insn_gen_break(BRK_UPROBE_BP)
> +#define UPROBE_SWBP_INSN   __emit_break(BRK_UPROBE_BP)
>  #define UPROBE_SWBP_INSN_SIZE  LOONGARCH_INSN_SIZE
>
> -#define UPROBE_XOLBP_INSN  larch_insn_gen_break(BRK_UPROBE_XOLBP)
> +#define UPROBE_XOLBP_INSN  __emit_break(BRK_UPROBE_XOLBP)
>

this looks correct (though based on pure code inspection only)

>  struct arch_uprobe {
> unsigned long   resume_era;
> diff --git a/arch/loongarch/kernel/uprobes.c b/arch/loongarch/kernel/uprobes.c
> index 87abc7137b73..90462d94c28f 100644
> --- a/arch/loongarch/kernel/uprobes.c
> +++ b/arch/loongarch/kernel/uprobes.c
> @@ -7,6 +7,14 @@
>
>  #define UPROBE_TRAP_NR UINT_MAX
>
> +static __init int check_emit_break(void)
> +{
> +   BUG_ON(UPROBE_SWBP_INSN  != larch_insn_gen_break(BRK_UPROBE_BP));
> +   BUG_ON(UPROBE_XOLBP_INSN != larch_insn_gen_break(BRK_UPROBE_XOLBP));
> +   return 0;
> +}
> +arch_initcall(check_emit_break);
> +

I wouldn't even bother with this, but whatever.

>  int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe,
>  struct mm_struct *mm, unsigned long addr)
>  {
> --
> 2.25.1.362.g51ebf55
>
>



Re: [PATCH] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 2:50 AM Peter Zijlstra  wrote:
>
>
> +Josj +LKML
>

ack, will add for next revision


> On Mon, Jul 01, 2024 at 04:10:27PM -0700, Andrii Nakryiko wrote:
> > When tracing user functions with uprobe functionality, it's common to
> > install the probe (e.g., a BPF program) at the first instruction of the
> > function. This is often going to be `push %rbp` instruction in function
> > preamble, which means that within that function frame pointer hasn't
> > been established yet. This leads to consistently missing an actual
> > caller of the traced function, because perf_callchain_user() only
> > records current IP (capturing traced function) and then following frame
> > pointer chain (which would be caller's frame, containing the address of
> > caller's caller).
> >
> > So when we have target_1 -> target_2 -> target_3 call chain and we are
> > tracing an entry to target_3, captured stack trace will report
> > target_1 -> target_3 call chain, which is wrong and confusing.
> >
> > This patch proposes a x86-64-specific heuristic to detect `push %rbp`
> > instruction being traced. Given entire kernel implementation of user
> > space stack trace capturing works under assumption that user space code
> > was compiled with frame pointer register (%rbp) preservation, it seems
> > pretty reasonable to use this instruction as a strong indicator that
> > this is the entry to the function. In that case, return address is still
> > pointed to by %rsp, so we fetch it and add to stack trace before
> > proceeding to unwind the rest using frame pointer-based logic.
> >
> > Signed-off-by: Andrii Nakryiko 
> > ---
> >  arch/x86/events/core.c  | 20 
> >  include/linux/uprobes.h |  2 ++
> >  kernel/events/uprobes.c |  2 ++
> >  3 files changed, 24 insertions(+)
> >
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > index 5b0dd07b1ef1..82d5570b58ff 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -2884,6 +2884,26 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
> > *entry, struct pt_regs *regs
> >   return;
> >
> >   pagefault_disable();
> > +
> > +#ifdef CONFIG_UPROBES
> > + /*
> > +  * If we are called from uprobe handler, and we are indeed at the very
> > +  * entry to user function (which is normally a `push %rbp` 
> > instruction,
> > +  * under assumption of application being compiled with frame 
> > pointers),
> > +  * we should read return address from *regs->sp before proceeding
> > +  * to follow frame pointers, otherwise we'll skip immediate caller
> > +  * as %rbp is not yet setup.
> > +  */
> > + if (current->utask) {
> > + struct arch_uprobe *auprobe = current->utask->auprobe;
> > + u64 ret_addr;
> > +
> > + if (auprobe && auprobe->insn[0] == 0x55 /* push %rbp */ &&
> > + !__get_user(ret_addr, (const u64 __user *)regs->sp))
>
> This u64 is wrong, perf_callchain_user() is always native size.
>
> Additionally, I suppose you should also add a hunk to
> perf_callchain_user32(), which is the compat case.
>

Ah, I misunderstood the purpose of perf_callchain_user32(), and so
assumed u64 was correct here. I get it now: perf_callchain_user32() is
the compat 32-on-64 case, while the general case can be either 32- or
64-bit. Will fix it, thanks!

> > + perf_callchain_store(entry, ret_addr);
> > + }
> > +#endif
> > +
> >   while (entry->nr < entry->max_stack) {
> >   if (!valid_user_frame(fp, sizeof(frame)))
> >   break;
> > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > index b503fafb7fb3..a270a5892ab4 100644
> > --- a/include/linux/uprobes.h
> > +++ b/include/linux/uprobes.h
> > @@ -76,6 +76,8 @@ struct uprobe_task {
> >   struct uprobe   *active_uprobe;
> >   unsigned long   xol_vaddr;
> >
> > + struct arch_uprobe  *auprobe;
> > +
> >   struct return_instance  *return_instances;
> >   unsigned intdepth;
> >  };
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 99be2adedbc0..6e22e4d80f1e 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -2082,6 +2082,7 @@ static void handler_chain(struct uprobe *uprobe, 
> > struct pt_regs *regs)
> >   bool need_prep = false; /* prepare return uprobe, when needed */
> >
> >   down_read(&uprobe->register_rwsem);
> > + current->utask->auprobe = &uprobe->arch;
> >   for (uc = uprobe->consumers; uc; uc = uc->next) {
> >   int rc = 0;
> >
> > @@ -2096,6 +2097,7 @@ static void handler_chain(struct uprobe *uprobe, 
> > struct pt_regs *regs)
> >
> >   remove &= rc;
> >   }
> > + current->utask->auprobe = NULL;
> >
> >   if (need_prep && !remove)
> >   prepare_uretprobe(uprobe, regs); /* put bp at return */
> > --
> > 2.43.0
> >



[PATCH v2] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-02 Thread Andrii Nakryiko
When tracing user functions with uprobe functionality, it's common to
install the probe (e.g., a BPF program) at the first instruction of the
function. This is often going to be `push %rbp` instruction in function
preamble, which means that within that function frame pointer hasn't
been established yet. This leads to consistently missing an actual
caller of the traced function, because perf_callchain_user() only
records current IP (capturing traced function) and then following frame
pointer chain (which would be caller's frame, containing the address of
caller's caller).

So when we have target_1 -> target_2 -> target_3 call chain and we are
tracing an entry to target_3, captured stack trace will report
target_1 -> target_3 call chain, which is wrong and confusing.

This patch proposes a x86-64-specific heuristic to detect `push %rbp`
(`push %ebp` on 32-bit architecture) instruction being traced. Given
entire kernel implementation of user space stack trace capturing works
under assumption that user space code was compiled with frame pointer
register (%rbp/%ebp) preservation, it seems pretty reasonable to use
this instruction as a strong indicator that this is the entry to the
function. In that case, return address is still pointed to by %rsp/%esp,
so we fetch it and add to stack trace before proceeding to unwind the
rest using frame pointer-based logic.

Signed-off-by: Andrii Nakryiko 
---
v1->v2:
  - use native unsigned long for ret_addr (Peter);
  - add same logic for compat logic in perf_callchain_user32 (Peter).

 arch/x86/events/core.c  | 33 +
 include/linux/uprobes.h |  2 ++
 kernel/events/uprobes.c |  2 ++
 3 files changed, 37 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5b0dd07b1ef1..60821c1ff2f3 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2833,6 +2833,19 @@ perf_callchain_user32(struct pt_regs *regs, struct 
perf_callchain_entry_ctx *ent
 
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
+
+#ifdef CONFIG_UPROBES
+   /* see perf_callchain_user() below for why we do this */
+   if (current->utask) {
+   struct arch_uprobe *auprobe = current->utask->auprobe;
+   u32 ret_addr;
+
+   if (auprobe && auprobe->insn[0] == 0x55 /* push %ebp */ &&
+   !__get_user(ret_addr, (const u32 __user *)regs->sp))
+   perf_callchain_store(entry, ret_addr);
+   }
+#endif
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
@@ -2884,6 +2897,26 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
*entry, struct pt_regs *regs
return;
 
pagefault_disable();
+
+#ifdef CONFIG_UPROBES
+   /*
+* If we are called from uprobe handler, and we are indeed at the very
+* entry to user function (which is normally a `push %rbp` instruction,
+* under assumption of application being compiled with frame pointers),
+* we should read return address from *regs->sp before proceeding
+* to follow frame pointers, otherwise we'll skip immediate caller
+* as %rbp is not yet setup.
+*/
+   if (current->utask) {
+   struct arch_uprobe *auprobe = current->utask->auprobe;
+   unsigned long ret_addr;
+
+   if (auprobe && auprobe->insn[0] == 0x55 /* push %rbp/%ebp */ &&
+   !__get_user(ret_addr, (const unsigned long __user 
*)regs->sp))
+   perf_callchain_store(entry, ret_addr);
+   }
+#endif
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b503fafb7fb3..a270a5892ab4 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -76,6 +76,8 @@ struct uprobe_task {
struct uprobe   *active_uprobe;
unsigned long   xol_vaddr;
 
+   struct arch_uprobe  *auprobe;
+
struct return_instance  *return_instances;
unsigned intdepth;
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 99be2adedbc0..6e22e4d80f1e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2082,6 +2082,7 @@ static void handler_chain(struct uprobe *uprobe, struct 
pt_regs *regs)
bool need_prep = false; /* prepare return uprobe, when needed */
 
down_read(&uprobe->register_rwsem);
+   current->utask->auprobe = &uprobe->arch;
for (uc = uprobe->consumers; uc; uc = uc->next) {
int rc = 0;
 
@@ -2096,6 +2097,7 @@ static void handler_chain(struct uprobe

Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 4:54 AM Peter Zijlstra  wrote:
>
>
> +LKML
>
> On Tue, Jul 02, 2024 at 12:23:53PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 01, 2024 at 03:39:23PM -0700, Andrii Nakryiko wrote:
> > > This patch set, ultimately, switches global uprobes_treelock from RW 
> > > spinlock
> > > to per-CPU RW semaphore, which has better performance and scales better 
> > > under
> > > contention and multiple parallel threads triggering lots of uprobes.
> >
> > Why not RCU + normal lock thing?
>
> Something like the *completely* untested below.
>
> ---
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..03b38f3f7be3 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
>  #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
>
>  static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */
> +static seqcount_rwlock_t uprobes_seqcount = 
> SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
>
>  #define UPROBES_HASH_SZ13
>  /* serialize uprobe->pending_list */
> @@ -54,6 +55,7 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
>  struct uprobe {
> struct rb_node  rb_node;/* node in the rb tree */
> refcount_t  ref;
> +   struct rcu_head rcu;
> struct rw_semaphore register_rwsem;
> struct rw_semaphore consumer_rwsem;
> struct list_headpending_list;
> @@ -67,7 +69,7 @@ struct uprobe {
>  * The generic code assumes that it has two members of unknown type
>  * owned by the arch-specific code:
>  *
> -*  insn -  copy_insn() saves the original instruction here for
> +*  insn -  copy_insn() saves the original instruction here for
>  *  arch_uprobe_analyze_insn().
>  *
>  *  ixol -  potentially modified instruction to execute out of
> @@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe)
> return uprobe;
>  }
>
> +static void uprobe_free_rcu(struct rcu_head *rcu)
> +{
> +   struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> +   kfree(uprobe);
> +}
> +
>  static void put_uprobe(struct uprobe *uprobe)
>  {
> if (refcount_dec_and_test(&uprobe->ref)) {
> @@ -604,7 +612,8 @@ static void put_uprobe(struct uprobe *uprobe)

right above this we have roughly this:

percpu_down_write(&uprobes_treelock);

/* refcount check */
rb_erase(&uprobe->rb_node, &uprobes_tree);

percpu_up_write(&uprobes_treelock);


This writer lock is necessary for modification of the RB tree. And I
was under the impression that I shouldn't be doing
percpu_(down|up)_write() inside a normal
rcu_read_lock()/rcu_read_unlock() region (percpu_down_write() has
might_sleep() in it). But maybe I'm wrong; hopefully Paul can help
clarify.

But actually, what's wrong with the RCU Tasks Trace flavor? I will
ultimately use it anyway to avoid uprobe taking an unnecessary refcount
and to protect the uprobe->consumers iteration and uc->handler() calls,
which can be sleepable and so would need rcu_read_lock_trace().
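
For reference, a minimal sketch of what sleepable-safe consumer
iteration under RCU Tasks Trace could look like (an illustration of the
idea only, not code from this series; return-value handling omitted):

    rcu_read_lock_trace();
    for (uc = READ_ONCE(uprobe->consumers); uc; uc = READ_ONCE(uc->next)) {
            if (uc->handler)
                    uc->handler(uc, regs);  /* may sleep under tasks-trace RCU */
    }
    rcu_read_unlock_trace();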

> mutex_lock(&delayed_uprobe_lock);
> delayed_uprobe_remove(uprobe, NULL);
> mutex_unlock(&delayed_uprobe_lock);
> -   kfree(uprobe);
> +
> +   call_rcu(&uprobe->rcu, uprobe_free_rcu);
> }
>  }
>
> @@ -668,12 +677,25 @@ static struct uprobe *__find_uprobe(struct inode 
> *inode, loff_t offset)
>  static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
>  {
> struct uprobe *uprobe;
> +   unsigned seq;
>
> -   read_lock(&uprobes_treelock);
> -   uprobe = __find_uprobe(inode, offset);
> -   read_unlock(&uprobes_treelock);
> +   guard(rcu)();
>
> -   return uprobe;
> +   do {
> +   seq = read_seqcount_begin(&uprobes_seqcount);
> +   uprobes = __find_uprobe(inode, offset);
> +   if (uprobes) {
> +   /*
> +* Lockless RB-tree lookups are prone to 
> false-negatives.
> +* If they find something, it's good. If they do not 
> find,
> +* it needs to be validated.
> +*/
> +   return uprobes;
> +   }
> +   } while (read_seqcount_retry(&uprobes_seqcount, seq));
> +
> +   /* Really didn't find anything. */
> +   return NULL;
>  }

Honest questio

Re: [PATCHv2 bpf-next 1/9] uprobe: Add support for session consumer

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:41 AM Jiri Olsa  wrote:
>
> Adding support for uprobe consumer to be defined as session and have
> new behaviour for consumer's 'handler' and 'ret_handler' callbacks.
>
> The session means that 'handler' and 'ret_handler' callbacks are
> connected in a way that allows to:
>
>   - control execution of 'ret_handler' from 'handler' callback
>   - share data between 'handler' and 'ret_handler' callbacks
>
> The session is enabled by setting new 'session' bool field to true
> in uprobe_consumer object.
>
> We keep count of session consumers for uprobe and allocate session_consumer
> object for each in return_instance object. This allows us to store
> return values of 'handler' callbacks and data pointers of shared
> data between both handlers.
>
> The session concept fits to our common use case where we do filtering
> on entry uprobe and based on the result we decide to run the return
> uprobe (or not).
>
> It's also convenient to share the data between session callbacks.
>
> The control of 'ret_handler' callback execution is done via return
> value of the 'handler' callback. If it's 0 we install and execute
> return uprobe, if it's 1 we do not.
>
> Signed-off-by: Jiri Olsa 
> ---
>  include/linux/uprobes.h |  16 -
>  kernel/events/uprobes.c | 129 +---
>  kernel/trace/bpf_trace.c|   6 +-
>  kernel/trace/trace_uprobe.c |  12 ++--
>  4 files changed, 144 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..903a860a8d01 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -34,15 +34,18 @@ enum uprobe_filter_ctx {
>  };
>
>  struct uprobe_consumer {
> -   int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> +   int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, 
> __u64 *data);
> int (*ret_handler)(struct uprobe_consumer *self,
> unsigned long func,
> -   struct pt_regs *regs);
> +   struct pt_regs *regs, __u64 *data);
> bool (*filter)(struct uprobe_consumer *self,
> enum uprobe_filter_ctx ctx,
> struct mm_struct *mm);
>
> struct uprobe_consumer *next;
> +
> +   boolsession;/* marks uprobe session 
> consumer */
> +   unsigned intsession_id; /* set when uprobe_consumer 
> is registered */
>  };
>
>  #ifdef CONFIG_UPROBES
> @@ -80,6 +83,12 @@ struct uprobe_task {
> unsigned intdepth;
>  };
>
> +struct session_consumer {
> +   __u64   cookie;
> +   unsigned intid;
> +   int rc;

You'll be using u64 for the ID, right? So this struct will be 24 bytes.
Maybe we can just use the topmost bit of the ID to store whether the
uretprobe should run or not? It's trivial to mask out during ID
comparisons.
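
I.e., something along these lines (hypothetical names, just to
illustrate the encoding):

    /* reuse the top bit of the 64-bit ID as the "run ret_handler" flag,
     * keeping struct session_consumer at 16 bytes */
    #define SESSION_ID_MASK    (~(1ULL << 63))
    #define SESSION_RUN_RET    (1ULL << 63)

    struct session_consumer {
            __u64   cookie;
            __u64   id;     /* ID in bits 0-62, flag in bit 63 */
    };

    /* mark that the return handler should run for this consumer */
    sc->id |= SESSION_RUN_RET;

    static bool session_id_eq(__u64 a, __u64 b)
    {
            return (a & SESSION_ID_MASK) == (b & SESSION_ID_MASK);
    }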

> +};
> +
>  struct return_instance {
> struct uprobe   *uprobe;
> unsigned long   func;
> @@ -88,6 +97,9 @@ struct return_instance {
> boolchained;/* true, if instance is 
> nested */
>
> struct return_instance  *next;  /* keep as stack */
> +
> +   int sessions_cnt;

there is a 7-byte gap before the 'next' field, let's put sessions_cnt there

> +   struct session_consumer sessions[];
>  };
>
>  enum rp_check {
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..4da410460f2a 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -63,6 +63,8 @@ struct uprobe {
> loff_t  ref_ctr_offset;
> unsigned long   flags;
>
> +   unsigned intsessions_cnt;
> +
> /*
>  * The generic code assumes that it has two members of unknown type
>  * owned by the arch-specific code:
> @@ -750,11 +752,30 @@ static struct uprobe *alloc_uprobe(struct inode *inode, 
> loff_t offset,
> return uprobe;
>  }
>
> +static void
> +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer *uc)
> +{
> +   static unsigned int session_id;

(besides what Peter mentioned about wrap-around of the 32-bit counter)
let's use an atomic here so we don't (unnecessarily) rely on any
particular locking; this might make my life easier in the future,
thanks. This is registration time, low frequency, an extra atomic won't
hurt.

It might actually already be broken for two independently registering
uprobes.
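
I.e., roughly (also making the ID 64-bit to sidestep the wrap-around
concern; sketch only):

    static atomic64_t session_id;

    if (uc->session) {
            uprobe->sessions_cnt++;
            uc->session_id = atomic64_inc_return(&session_id);
    }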

> +
> +   if (uc->session) {
> +   uprobe->sessions_cnt++;
> +   uc->session_id = ++session_id ?: ++session_id;
> +   }
> +}
> +
> +static void
> +uprobe_consumer_unaccount(struct uprobe *uprobe, struct uprobe_consumer *uc)

This fits in 100 characters, so keep it on a single line, please. Same
for the account function.

> +{
> +   if (uc-

Re: [PATCHv2 bpf-next 1/9] uprobe: Add support for session consumer

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 9:11 AM Jiri Olsa  wrote:
>
> On Tue, Jul 02, 2024 at 03:04:08PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 01, 2024 at 06:41:07PM +0200, Jiri Olsa wrote:
> >
> > > +static void
> > > +uprobe_consumer_account(struct uprobe *uprobe, struct uprobe_consumer 
> > > *uc)
> > > +{
> > > +   static unsigned int session_id;
> > > +
> > > +   if (uc->session) {
> > > +   uprobe->sessions_cnt++;
> > > +   uc->session_id = ++session_id ?: ++session_id;
> > > +   }
> > > +}
> >
> > The way I understand this code, you create a consumer every time you do
> > uprobe_register() and unregister makes it go away.
> >
> > Now, register one, then 4g-1 times register+unregister, then register
> > again.
> >
> > The above seems to then result in two consumers with the same
> > session_id, which leads to trouble.
> >
> > Hmm?
>
> ugh true.. will make it u64 :)
>
> I think we could store uprobe_consumer pointer+ref in session_consumer,
> and that would make the unregister path more interesting.. will check

More interesting how? It's actually a great idea: the uprobe_consumer
pointer itself is a unique, 64-bit ID. We can still use the lowest bit
for the RC (see my other reply).
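
I.e., roughly (hypothetical; relies on struct uprobe_consumer being at
least 2-byte aligned so bit 0 is free):

    /* the consumer pointer is the ID; bit 0 records the entry handler's
     * return code (0 means "run the return probe") */
    sc->id = (unsigned long)uc | (rc & 1UL);

    /* on return, run ret_handler only for a matching consumer with bit 0 clear */
    if ((sc->id & ~1UL) == (unsigned long)uc && !(sc->id & 1UL))
            uc->ret_handler(uc, ri->func, regs, &sc->cookie);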

>
> thanks,
> jirka



Re: [PATCHv2 bpf-next 2/9] bpf: Add support for uprobe multi session attach

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:42 AM Jiri Olsa  wrote:
>
> Adding support to attach bpf program for entry and return probe
> of the same function. This is common use case which at the moment
> requires to create two uprobe multi links.
>
> Adding new BPF_TRACE_UPROBE_SESSION attach type that instructs
> kernel to attach single link program to both entry and exit probe.
>
> It's possible to control execution of the bpf program on return
> probe simply by returning zero or non zero from the entry bpf
> program execution to execute or not the bpf program on return
> probe respectively.
>
> Signed-off-by: Jiri Olsa 
> ---
>  include/uapi/linux/bpf.h   |  1 +
>  kernel/bpf/syscall.c   |  9 +++--
>  kernel/trace/bpf_trace.c   | 25 +++--
>  tools/include/uapi/linux/bpf.h |  1 +
>  4 files changed, 28 insertions(+), 8 deletions(-)
>

LGTM

Acked-by: Andrii Nakryiko 

> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 35bcf52dbc65..1d93cb014884 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1116,6 +1116,7 @@ enum bpf_attach_type {
> BPF_NETKIT_PRIMARY,
> BPF_NETKIT_PEER,
> BPF_TRACE_KPROBE_SESSION,
> +   BPF_TRACE_UPROBE_SESSION,
> __MAX_BPF_ATTACH_TYPE
>  };
>
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 869265852d51..2a63a528fa3c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -4049,10 +4049,14 @@ static int bpf_prog_attach_check_attach_type(const 
> struct bpf_prog *prog,
> if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
> attach_type != BPF_TRACE_UPROBE_MULTI)
> return -EINVAL;
> +   if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
> +   attach_type != BPF_TRACE_UPROBE_SESSION)
> +   return -EINVAL;
> if (attach_type != BPF_PERF_EVENT &&
> attach_type != BPF_TRACE_KPROBE_MULTI &&
> attach_type != BPF_TRACE_KPROBE_SESSION &&
> -   attach_type != BPF_TRACE_UPROBE_MULTI)
> +   attach_type != BPF_TRACE_UPROBE_MULTI &&
> +   attach_type != BPF_TRACE_UPROBE_SESSION)
> return -EINVAL;
> return 0;
> case BPF_PROG_TYPE_SCHED_CLS:
> @@ -5315,7 +5319,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t 
> uattr)
> else if (attr->link_create.attach_type == 
> BPF_TRACE_KPROBE_MULTI ||
>  attr->link_create.attach_type == 
> BPF_TRACE_KPROBE_SESSION)
> ret = bpf_kprobe_multi_link_attach(attr, prog);
> -   else if (attr->link_create.attach_type == 
> BPF_TRACE_UPROBE_MULTI)
> +   else if (attr->link_create.attach_type == 
> BPF_TRACE_UPROBE_MULTI ||
> +attr->link_create.attach_type == 
> BPF_TRACE_UPROBE_SESSION)
> ret = bpf_uprobe_multi_link_attach(attr, prog);
> break;
> default:
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 02d052639dfe..1b19c1cdb5e1 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1645,6 +1645,17 @@ static inline bool is_kprobe_session(const struct 
> bpf_prog *prog)
> return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
>  }
>
> +static inline bool is_uprobe_multi(const struct bpf_prog *prog)
> +{
> +   return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ||
> +  prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
> +}
> +
> +static inline bool is_uprobe_session(const struct bpf_prog *prog)
> +{
> +   return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
> +}
> +
>  static const struct bpf_func_proto *
>  kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  {
> @@ -1662,13 +1673,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, 
> const struct bpf_prog *prog)
> case BPF_FUNC_get_func_ip:
> if (is_kprobe_multi(prog))
> return &bpf_get_func_ip_proto_kprobe_multi;
> -   if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
> +   if (is_uprobe_multi(prog))
> return &bpf_get_func_ip_proto_uprobe_multi;
> return &bpf_get_func_ip_proto_kprobe;
> case BPF_FUNC_get_attach_cookie:
> if (is_kprobe_multi(prog))
> 

Re: [PATCHv2 bpf-next 3/9] bpf: Add support for uprobe multi session context

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:42 AM Jiri Olsa  wrote:
>
> Placing bpf_session_run_ctx layer in between bpf_run_ctx and
> bpf_uprobe_multi_run_ctx, so the session data can be retrieved
> from uprobe_multi link.
>
> Plus granting session kfuncs access to uprobe session programs.
>
> Signed-off-by: Jiri Olsa 
> ---
>  kernel/trace/bpf_trace.c | 23 +++
>  1 file changed, 15 insertions(+), 8 deletions(-)
>

Acked-by: Andrii Nakryiko 


> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 1b19c1cdb5e1..d431b880ca11 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -3184,7 +3184,7 @@ struct bpf_uprobe_multi_link {
>  };
>
>  struct bpf_uprobe_multi_run_ctx {
> -   struct bpf_run_ctx run_ctx;
> +   struct bpf_session_run_ctx session_ctx;
> unsigned long entry_ip;
> struct bpf_uprobe *uprobe;
>  };
> @@ -3297,10 +3297,15 @@ static const struct bpf_link_ops 
> bpf_uprobe_multi_link_lops = {
>
>  static int uprobe_prog_run(struct bpf_uprobe *uprobe,
>unsigned long entry_ip,
> -  struct pt_regs *regs)
> +  struct pt_regs *regs,
> +  bool is_return, void *data)
>  {
> struct bpf_uprobe_multi_link *link = uprobe->link;
> struct bpf_uprobe_multi_run_ctx run_ctx = {
> +   .session_ctx = {
> +   .is_return = is_return,
> +   .data = data,
> +   },
> .entry_ip = entry_ip,
> .uprobe = uprobe,
> };
> @@ -3319,7 +3324,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
>
> migrate_disable();
>
> -   old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> +   old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
> err = bpf_prog_run(link->link.prog, regs);
> bpf_reset_run_ctx(old_run_ctx);
>
> @@ -3349,7 +3354,7 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, 
> struct pt_regs *regs,
> struct bpf_uprobe *uprobe;
>
> uprobe = container_of(con, struct bpf_uprobe, consumer);
> -   return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
> +   return uprobe_prog_run(uprobe, instruction_pointer(regs), regs, 
> false, data);
>  }
>
>  static int
> @@ -3359,14 +3364,15 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer 
> *con, unsigned long func, s
> struct bpf_uprobe *uprobe;
>
> uprobe = container_of(con, struct bpf_uprobe, consumer);
> -   return uprobe_prog_run(uprobe, func, regs);
> +   return uprobe_prog_run(uprobe, func, regs, true, data);
>  }
>
>  static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
>  {
> struct bpf_uprobe_multi_run_ctx *run_ctx;
>
> -   run_ctx = container_of(current->bpf_ctx, struct 
> bpf_uprobe_multi_run_ctx, run_ctx);
> +   run_ctx = container_of(current->bpf_ctx, struct 
> bpf_uprobe_multi_run_ctx,
> +  session_ctx.run_ctx);
> return run_ctx->entry_ip;
>  }
>
> @@ -3374,7 +3380,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx 
> *ctx)
>  {
> struct bpf_uprobe_multi_run_ctx *run_ctx;
>
> -   run_ctx = container_of(current->bpf_ctx, struct 
> bpf_uprobe_multi_run_ctx, run_ctx);
> +   run_ctx = container_of(current->bpf_ctx, struct 
> bpf_uprobe_multi_run_ctx,
> +  session_ctx.run_ctx);
> return run_ctx->uprobe->cookie;
>  }
>
> @@ -3565,7 +3572,7 @@ static int bpf_kprobe_multi_filter(const struct 
> bpf_prog *prog, u32 kfunc_id)
> if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
> return 0;
>
> -   if (!is_kprobe_session(prog))
> +   if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
> return -EACCES;
>
> return 0;
> --
> 2.45.2
>
>



Re: [PATCHv2 bpf-next 4/9] libbpf: Add support for uprobe multi session attach

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:42 AM Jiri Olsa  wrote:
>
> Adding support to attach program in uprobe session mode
> with bpf_program__attach_uprobe_multi function.
>
> Adding session bool to bpf_uprobe_multi_opts struct that allows
> to load and attach the bpf program via uprobe session, i.e. the
> attachment creates an uprobe multi session.
>
> Also adding new program loader section that allows:
>   SEC("uprobe.session/bpf_fentry_test*")
>
> and loads/attaches uprobe program as uprobe session.
>
> Signed-off-by: Jiri Olsa 
> ---
>  tools/lib/bpf/bpf.c|  1 +
>  tools/lib/bpf/libbpf.c | 50 --
>  tools/lib/bpf/libbpf.h |  4 +++-
>  3 files changed, 52 insertions(+), 3 deletions(-)
>

[...]

> @@ -9362,6 +9363,7 @@ static const struct bpf_sec_def section_defs[] = {
> SEC_DEF("kprobe.session+",  KPROBE, BPF_TRACE_KPROBE_SESSION, 
> SEC_NONE, attach_kprobe_session),
> SEC_DEF("uprobe.multi+",KPROBE, BPF_TRACE_UPROBE_MULTI, 
> SEC_NONE, attach_uprobe_multi),
> SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, 
> SEC_NONE, attach_uprobe_multi),
> +   SEC_DEF("uprobe.session+",  KPROBE, BPF_TRACE_UPROBE_SESSION, 
> SEC_NONE, attach_uprobe_session),

sleepable ones as well?
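
Something like this, perhaps (an untested sketch mirroring the existing
uprobe.multi.s+ entries below; assumes attach_uprobe_session can handle
the sleepable variant as-is):

  SEC_DEF("uprobe.session.s+",    KPROBE, BPF_TRACE_UPROBE_SESSION, SEC_SLEEPABLE, attach_uprobe_session),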

> SEC_DEF("uprobe.multi.s+",  KPROBE, BPF_TRACE_UPROBE_MULTI, 
> SEC_SLEEPABLE, attach_uprobe_multi),
> SEC_DEF("uretprobe.multi.s+",   KPROBE, BPF_TRACE_UPROBE_MULTI, 
> SEC_SLEEPABLE, attach_uprobe_multi),
> SEC_DEF("ksyscall+",KPROBE, 0, SEC_NONE, attach_ksyscall),
> @@ -11698,6 +11700,40 @@ static int attach_uprobe_multi(const struct 
> bpf_program *prog, long cookie, stru
> return ret;
>  }
>
> +static int attach_uprobe_session(const struct bpf_program *prog, long 
> cookie, struct bpf_link **link)
> +{
> +   char *binary_path = NULL, *func_name = NULL;
> +   LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
> +   .session = true,
> +   );

nit: keep a single line?
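
I.e., something like this (just a sketch of the single-line form):

  LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, .session = true);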

> +   int n, ret = -EINVAL;
> +   const char *spec;
> +
> +   *link = NULL;
> +
> +   spec = prog->sec_name + sizeof("uprobe.session/") - 1;
> +   n = sscanf(spec, "%m[^:]:%m[^\n]",
> +  &binary_path, &func_name);

single line, please; wrapping lines is a necessary evil only when a line
is genuinely too long, which isn't the case here

[...]



Re: [PATCHv2 bpf-next 5/9] libbpf: Add uprobe session attach type names to attach_type_name

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:43 AM Jiri Olsa  wrote:
>
> Adding uprobe session attach type name to attach_type_name,
> so libbpf_bpf_attach_type_str returns proper string name for
> BPF_TRACE_UPROBE_SESSION attach type.
>
> Signed-off-by: Jiri Olsa 
> ---
>  tools/lib/bpf/libbpf.c | 1 +
>  1 file changed, 1 insertion(+)
>

Can you merge this into a patch that adds BPF_TRACE_UPROBE_SESSION to
keep bisectability of BPF selftests? It's a trivial patch, so
shouldn't be a big deal.

> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 492a8eb4d047..e69a54264580 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -133,6 +133,7 @@ static const char * const attach_type_name[] = {
> [BPF_NETKIT_PRIMARY]= "netkit_primary",
> [BPF_NETKIT_PEER]   = "netkit_peer",
> [BPF_TRACE_KPROBE_SESSION]  = "trace_kprobe_session",
> +   [BPF_TRACE_UPROBE_SESSION]  = "trace_uprobe_session",
>  };
>
>  static const char * const link_type_name[] = {
> --
> 2.45.2
>



Re: [PATCHv2 bpf-next 6/9] selftests/bpf: Add uprobe session test

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:43 AM Jiri Olsa  wrote:
>
> Adding uprobe session test and testing that the entry program
> return value controls execution of the return probe program.
>
> Signed-off-by: Jiri Olsa 
> ---
>  .../bpf/prog_tests/uprobe_multi_test.c| 42 +++
>  .../bpf/progs/uprobe_multi_session.c  | 53 +++
>  2 files changed, 95 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_session.c
>

LGTM.
Acked-by: Andrii Nakryiko 

> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c 
> b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> index bf6ca8e3eb13..cd9581f46c73 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> @@ -6,6 +6,7 @@
>  #include "uprobe_multi.skel.h"
>  #include "uprobe_multi_bench.skel.h"
>  #include "uprobe_multi_usdt.skel.h"
> +#include "uprobe_multi_session.skel.h"
>  #include "bpf/libbpf_internal.h"
>  #include "testing_helpers.h"
>  #include "../sdt.h"
> @@ -615,6 +616,45 @@ static void test_link_api(void)
> __test_link_api(child);
>  }
>
> +static void test_session_skel_api(void)
> +{
> +   struct uprobe_multi_session *skel = NULL;
> +   LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
> +   struct bpf_link *link = NULL;
> +   int err;
> +
> +   skel = uprobe_multi_session__open_and_load();
> +   if (!ASSERT_OK_PTR(skel, "fentry_raw_skel_load"))
> +   goto cleanup;
> +
> +   skel->bss->pid = getpid();
> +
> +   err = uprobe_multi_session__attach(skel);
> +   if (!ASSERT_OK(err, " uprobe_multi_session__attach"))
> +   goto cleanup;
> +
> +   /* trigger all probes */
> +   skel->bss->uprobe_multi_func_1_addr = (__u64) uprobe_multi_func_1;
> +   skel->bss->uprobe_multi_func_2_addr = (__u64) uprobe_multi_func_2;
> +   skel->bss->uprobe_multi_func_3_addr = (__u64) uprobe_multi_func_3;
> +
> +   uprobe_multi_func_1();
> +   uprobe_multi_func_2();
> +   uprobe_multi_func_3();
> +
> +   /*
> +* We expect 2 for uprobe_multi_func_2 because it runs both 
> entry/return probe,
> +* uprobe_multi_func_[13] run just the entry probe.
> +*/
> +   ASSERT_EQ(skel->bss->uprobe_session_result[0], 1, 
> "uprobe_multi_func_1_result");
> +   ASSERT_EQ(skel->bss->uprobe_session_result[1], 2, 
> "uprobe_multi_func_2_result");
> +   ASSERT_EQ(skel->bss->uprobe_session_result[2], 1, 
> "uprobe_multi_func_3_result");
> +
> +cleanup:
> +   bpf_link__destroy(link);
> +   uprobe_multi_session__destroy(skel);
> +}
> +
>  static void test_bench_attach_uprobe(void)
>  {
> long attach_start_ns = 0, attach_end_ns = 0;
> @@ -703,4 +743,6 @@ void test_uprobe_multi_test(void)
> test_bench_attach_usdt();
> if (test__start_subtest("attach_api_fails"))
> test_attach_api_fails();
> +   if (test__start_subtest("session"))
> +   test_session_skel_api();
>  }
> diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c 
> b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c
> new file mode 100644
> index ..72c00ae68372
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c
> @@ -0,0 +1,53 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "bpf_kfuncs.h"
> +#include "bpf_misc.h"
> +
> +char _license[] SEC("license") = "GPL";
> +
> +__u64 uprobe_multi_func_1_addr = 0;
> +__u64 uprobe_multi_func_2_addr = 0;
> +__u64 uprobe_multi_func_3_addr = 0;
> +
> +__u64 uprobe_session_result[3];
> +
> +int pid = 0;
> +
> +static int uprobe_multi_check(void *ctx, bool is_return)
> +{
> +   const __u64 funcs[] = {
> +   uprobe_multi_func_1_addr,
> +   uprobe_multi_func_2_addr,
> +   uprobe_multi_func_3_addr,
> +   };
> +   unsigned int i;
> +   __u64 addr;
> +
> +   if (bpf_get_current_pid_tgid() >> 32 != pid)
> +   return 1;
> +
> +   addr = bpf_get_func_ip(ctx);
> +
> +   for (i = 0; i < ARRAY_SIZE(funcs); i++) {
> +   if (funcs[i] == addr) {
> +   uprobe_session_result[i]++;
> +   break;
> +   }
> +   }
> +
> +   /* only uprobe_multi_func_2 executes return probe */
> +   if ((addr == uprobe_multi_func_1_addr) ||
> +   (addr == uprobe_multi_func_3_addr))
> +   return 1;
> +
> +   return 0;
> +}
> +
> +SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*")
> +int uprobe(struct pt_regs *ctx)
> +{
> +   return uprobe_multi_check(ctx, bpf_session_is_return());
> +}
> --
> 2.45.2
>



Re: [PATCHv2 bpf-next 7/9] selftests/bpf: Add uprobe session cookie test

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:43 AM Jiri Olsa  wrote:
>
> Adding uprobe session test that verifies the cookie value
> gets properly propagated from entry to return program.
>
> Signed-off-by: Jiri Olsa 
> ---
>  .../bpf/prog_tests/uprobe_multi_test.c| 31 
>  .../bpf/progs/uprobe_multi_session_cookie.c   | 48 +++
>  2 files changed, 79 insertions(+)
>  create mode 100644 
> tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
>

LGTM

Acked-by: Andrii Nakryiko 


> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c 
> b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> index cd9581f46c73..d5f78fc61013 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> @@ -7,6 +7,7 @@
>  #include "uprobe_multi_bench.skel.h"
>  #include "uprobe_multi_usdt.skel.h"
>  #include "uprobe_multi_session.skel.h"
> +#include "uprobe_multi_session_cookie.skel.h"
>  #include "bpf/libbpf_internal.h"
>  #include "testing_helpers.h"
>  #include "../sdt.h"
> @@ -655,6 +656,34 @@ static void test_session_skel_api(void)
> uprobe_multi_session__destroy(skel);
>  }
>
> +static void test_session_cookie_skel_api(void)
> +{
> +   struct uprobe_multi_session_cookie *skel = NULL;
> +   int err;
> +
> +   skel = uprobe_multi_session_cookie__open_and_load();
> +   if (!ASSERT_OK_PTR(skel, "fentry_raw_skel_load"))
> +   goto cleanup;
> +
> +   skel->bss->pid = getpid();
> +
> +   err = uprobe_multi_session_cookie__attach(skel);
> +   if (!ASSERT_OK(err, " kprobe_multi_session__attach"))
> +   goto cleanup;
> +
> +   /* trigger all probes */
> +   uprobe_multi_func_1();
> +   uprobe_multi_func_2();
> +   uprobe_multi_func_3();
> +
> +   ASSERT_EQ(skel->bss->test_uprobe_1_result, 1, "test_uprobe_1_result");
> +   ASSERT_EQ(skel->bss->test_uprobe_2_result, 2, "test_uprobe_2_result");
> +   ASSERT_EQ(skel->bss->test_uprobe_3_result, 3, "test_uprobe_3_result");
> +
> +cleanup:
> +   uprobe_multi_session_cookie__destroy(skel);
> +}
> +
>  static void test_bench_attach_uprobe(void)
>  {
> long attach_start_ns = 0, attach_end_ns = 0;
> @@ -745,4 +774,6 @@ void test_uprobe_multi_test(void)
> test_attach_api_fails();
> if (test__start_subtest("session"))
> test_session_skel_api();
> +   if (test__start_subtest("session_cookie"))
> +   test_session_cookie_skel_api();
>  }
> diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c 
> b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
> new file mode 100644
> index ..5befdf944dc6
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
> @@ -0,0 +1,48 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "bpf_kfuncs.h"
> +
> +char _license[] SEC("license") = "GPL";
> +
> +int pid = 0;
> +
> +__u64 test_uprobe_1_result = 0;
> +__u64 test_uprobe_2_result = 0;
> +__u64 test_uprobe_3_result = 0;
> +
> +static int check_cookie(__u64 val, __u64 *result)
> +{
> +   __u64 *cookie;
> +
> +   if (bpf_get_current_pid_tgid() >> 32 != pid)
> +   return 1;
> +
> +   cookie = bpf_session_cookie();
> +
> +   if (bpf_session_is_return())
> +   *result = *cookie == val ? val : 0;
> +   else
> +   *cookie = val;
> +   return 0;
> +}
> +
> +SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1")
> +int uprobe_1(struct pt_regs *ctx)
> +{
> +   return check_cookie(1, &test_uprobe_1_result);
> +}
> +
> +SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2")
> +int uprobe_2(struct pt_regs *ctx)
> +{
> +   return check_cookie(2, &test_uprobe_2_result);
> +}
> +
> +SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3")
> +int uprobe_3(struct pt_regs *ctx)
> +{
> +   return check_cookie(3, &test_uprobe_3_result);
> +}
> --
> 2.45.2
>



Re: [PATCHv2 bpf-next 8/9] selftests/bpf: Add uprobe session recursive test

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:43 AM Jiri Olsa  wrote:
>
> Adding uprobe session test that verifies the cookie value is stored
> properly when single uprobe-ed function is executed recursively.
>
> Signed-off-by: Jiri Olsa 
> ---
>  .../bpf/prog_tests/uprobe_multi_test.c| 57 +++
>  .../progs/uprobe_multi_session_recursive.c| 44 ++
>  2 files changed, 101 insertions(+)
>  create mode 100644 
> tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c
>

Nice!

Acked-by: Andrii Nakryiko 


[...]

> +static void test_session_recursive_skel_api(void)
> +{
> +   struct uprobe_multi_session_recursive *skel = NULL;
> +   int i, err;
> +
> +   skel = uprobe_multi_session_recursive__open_and_load();
> +   if (!ASSERT_OK_PTR(skel, 
> "uprobe_multi_session_recursive__open_and_load"))
> +   goto cleanup;
> +
> +   skel->bss->pid = getpid();
> +
> +   err = uprobe_multi_session_recursive__attach(skel);
> +   if (!ASSERT_OK(err, "uprobe_multi_session_recursive__attach"))
> +   goto cleanup;
> +
> +   for (i = 0; i < ARRAY_SIZE(skel->bss->test_uprobe_cookie_entry); i++)
> +   skel->bss->test_uprobe_cookie_entry[i] = i + 1;
> +
> +   uprobe_session_recursive(5);
> +
> +   /*

nit: unnecessary empty comment line

> +* entry uprobe:
> +* uprobe_session_recursive(5) { *cookie = 1, return 0
> +*   uprobe_session_recursive(4) {   *cookie = 2, return 1
> +* uprobe_session_recursive(3) { *cookie = 3, return 0
> +*   uprobe_session_recursive(2) {   *cookie = 4, return 1
> +* uprobe_session_recursive(1) { *cookie = 5, return 0
> +*   uprobe_session_recursive(0) {   *cookie = 6, return 1
> +*  return uprobe:
> +*   } i = 0  not executed
> +* } i = 1
> test_uprobe_cookie_return[0] = 5
> +*   } i = 2  not executed
> +* } i = 3
> test_uprobe_cookie_return[1] = 3
> +*   } i = 4  not executed
> +* } i = 5
> test_uprobe_cookie_return[2] = 1
> +*/
> +

[...]



Re: [PATCHv2 bpf-next 9/9] selftests/bpf: Add uprobe session consumers test

2024-07-02 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 9:44 AM Jiri Olsa  wrote:
>
> Adding test that attached/detaches multiple consumers on
> single uprobe and verifies all were hit as expected.
>
> Signed-off-by: Jiri Olsa 
> ---
>  .../bpf/prog_tests/uprobe_multi_test.c| 203 ++
>  .../progs/uprobe_multi_session_consumers.c|  53 +
>  2 files changed, 256 insertions(+)
>  create mode 100644 
> tools/testing/selftests/bpf/progs/uprobe_multi_session_consumers.c
>

This is clever, though bit notation obscures the meaning of the code a
bit. But thanks for the long comment explaining the overall idea.

> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c 
> b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> index b521590fdbb9..83eac954cf00 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
> @@ -9,6 +9,7 @@
>  #include "uprobe_multi_session.skel.h"
>  #include "uprobe_multi_session_cookie.skel.h"
>  #include "uprobe_multi_session_recursive.skel.h"
> +#include "uprobe_multi_session_consumers.skel.h"
>  #include "bpf/libbpf_internal.h"
>  #include "testing_helpers.h"
>  #include "../sdt.h"
> @@ -739,6 +740,206 @@ static void test_session_recursive_skel_api(void)
> uprobe_multi_session_recursive__destroy(skel);
>  }
>
> +static int uprobe_attach(struct uprobe_multi_session_consumers *skel, int 
> bit)
> +{
> +   struct bpf_program **prog = &skel->progs.uprobe_0 + bit;
> +   struct bpf_link **link = &skel->links.uprobe_0 + bit;
> +   LIBBPF_OPTS(bpf_uprobe_multi_opts, opts);
> +
> +   /*
> +* bit: 0,1 uprobe session
> +* bit: 2,3 uprobe entry
> +* bit: 4,5 uprobe return
> +*/
> +   opts.session = bit < 2;
> +   opts.retprobe = bit == 4 || bit == 5;
> +
> +   *link = bpf_program__attach_uprobe_multi(*prog, 0, "/proc/self/exe",
> +
> "uprobe_session_consumer_test",
> +&opts);
> +   if (!ASSERT_OK_PTR(*link, "bpf_program__attach_uprobe_multi"))
> +   return -1;
> +   return 0;
> +}
> +
> +static void uprobe_detach(struct uprobe_multi_session_consumers *skel, int 
> bit)
> +{
> +   struct bpf_link **link = &skel->links.uprobe_0 + bit;

ok, this is nasty, no one guarantees this should keep working,
explicit switch would be preferable
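
Something along these lines, maybe (a rough sketch, assuming the
programs/links are actually named uprobe_0..uprobe_5 in the skeleton;
adjust to the real names):

  static struct bpf_link **uprobe_link(struct uprobe_multi_session_consumers *skel, int idx)
  {
          /* map an explicit index to its skeleton link slot */
          switch (idx) {
          case 0: return &skel->links.uprobe_0;
          case 1: return &skel->links.uprobe_1;
          case 2: return &skel->links.uprobe_2;
          case 3: return &skel->links.uprobe_3;
          case 4: return &skel->links.uprobe_4;
          case 5: return &skel->links.uprobe_5;
          default: return NULL;
          }
  }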

> +
> +   bpf_link__destroy(*link);
> +   *link = NULL;
> +}
> +
> +static bool test_bit(int bit, unsigned long val)
> +{
> +   return val & (1 << bit);
> +}
> +
> +noinline int
> +uprobe_session_consumer_test(struct uprobe_multi_session_consumers *skel,
> +unsigned long before, unsigned long after)
> +{
> +   int bit;
> +
> +   /* detach uprobe for each unset bit in 'before' state ... */
> +   for (bit = 0; bit < 6; bit++) {

Does "bit" correspond to the uprobe_X program? Maybe call it an uprobe
index or something, if that's the case? bits are just representations,
but semantically meaningful is identifier of an uprobe program, right?

> +   if (test_bit(bit, before) && !test_bit(bit, after))
> +   uprobe_detach(skel, bit);
> +   }
> +
> +   /* ... and attach all new bits in 'after' state */
> +   for (bit = 0; bit < 6; bit++) {
> +   if (!test_bit(bit, before) && test_bit(bit, after)) {
> +   if (!ASSERT_OK(uprobe_attach(skel, bit), 
> "uprobe_attach_after"))
> +   return -1;
> +   }
> +   }
> +   return 0;
> +}
> +

[...]

> +
> +static void test_session_consumers(void)
> +{
> +   struct uprobe_multi_session_consumers *skel;
> +   int before, after;
> +
> +   skel = uprobe_multi_session_consumers__open_and_load();
> +   if (!ASSERT_OK_PTR(skel, 
> "uprobe_multi_session_consumers__open_and_load"))
> +   return;
> +
> +   /*
> +* The idea of this test is to try all possible combinations of
> +* uprobes consumers attached on single function.
> +*
> +*  - 1 uprobe session with return handler called
> +*  - 1 uprobe session without return handler called
> +*  - 2 uprobe entry consumer
> +*  - 2 uprobe exit consumers
> +*
> +* The test uses 6 uprobes attached on single function, but that
> +* translates into single uprobe with 6 consumers in kernel.
> +*
> +* The before/after values present the state of attached consumers
> +* before and after the probed function:
> +*
> +*  bit 0   : uprobe session with return
> +*  bit 1   : uprobe session with no return
> +*  bit 2,3 : uprobe entry
> +*  bit 4,5 : uprobe return
> +*
> +* For example for:
> +*
> +*   before = 0b10101
> +*   after  = 0b00110
> + 

Re: [PATCH v2] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 4:39 PM Josh Poimboeuf  wrote:
>
> On Tue, Jul 02, 2024 at 04:35:56PM -0700, Josh Poimboeuf wrote:
> > On Tue, Jul 02, 2024 at 10:18:58AM -0700, Andrii Nakryiko wrote:
> > > When tracing user functions with uprobe functionality, it's common to
> > > install the probe (e.g., a BPF program) at the first instruction of the
> > > function. This is often going to be `push %rbp` instruction in function
> > > preamble, which means that within that function frame pointer hasn't
> > > been established yet. This leads to consistently missing an actual
> > > caller of the traced function, because perf_callchain_user() only
> > > records current IP (capturing traced function) and then following frame
> > > pointer chain (which would be caller's frame, containing the address of
> > > caller's caller).
> > >
> > > So when we have target_1 -> target_2 -> target_3 call chain and we are
> > > tracing an entry to target_3, captured stack trace will report
> > > target_1 -> target_3 call chain, which is wrong and confusing.
> > >
> > > This patch proposes a x86-64-specific heuristic to detect `push %rbp`
> > > (`push %ebp` on 32-bit architecture) instruction being traced. Given
> > > entire kernel implementation of user space stack trace capturing works
> > > under assumption that user space code was compiled with frame pointer
> > > register (%rbp/%ebp) preservation, it seems pretty reasonable to use
> > > this instruction as a strong indicator that this is the entry to the
> > > function. In that case, return address is still pointed to by %rsp/%esp,
> > > so we fetch it and add to stack trace before proceeding to unwind the
> > > rest using frame pointer-based logic.
> > >
> > > Signed-off-by: Andrii Nakryiko 
> >
> > Should it also check for ENDBR64?
> >

Sure, I can add a check for endbr64 as well. endbr64 probably can be
used not just at function entry, is that right? So it might be another
case of false positive (which I think is ok, see below).

> > When compiled with -fcf-protection=branch, the first instruction of the
> > function will almost always be ENDBR64.  I'm not sure about other
> > distros, but at least Fedora compiles its binaries like that.
>
> BTW, there are some cases (including leaf functions and some stack
> alignment sequences) where a "push %rbp" can happen inside a function.
> Then it would presumably add a bogus trace entry.  Are such false
> positives ok?

I think such cases should be rare. People mostly seem to trace user
function entry/exit, rarely if ever they trace something within the
function, except for USDT cases, where it will be a nop instruction
that they trace.

In general, even with false positives, I think it's overwhelmingly
better to get a correct entry stack trace 99.9% of the time, and in the
remaining 0.1% of cases it's fine having one extra bogus entry (but the rest
should still be correct), which should be easy for humans to recognize
and filter out, if necessary.

>
> --
> Josh



Re: [PATCHv2 bpf-next 1/9] uprobe: Add support for session consumer

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 4:55 PM Masami Hiramatsu  wrote:
>
> Hi Jiri,
>
> On Mon,  1 Jul 2024 18:41:07 +0200
> Jiri Olsa  wrote:
>
> > Adding support for uprobe consumer to be defined as session and have
> > new behaviour for consumer's 'handler' and 'ret_handler' callbacks.
> >
> > The session means that 'handler' and 'ret_handler' callbacks are
> > connected in a way that allows to:
> >
> >   - control execution of 'ret_handler' from 'handler' callback
> >   - share data between 'handler' and 'ret_handler' callbacks
> >
> > The session is enabled by setting new 'session' bool field to true
> > in uprobe_consumer object.
> >
> > We keep count of session consumers for uprobe and allocate session_consumer
> > object for each in return_instance object. This allows us to store
> > return values of 'handler' callbacks and data pointers of shared
> > data between both handlers.
> >
> > The session concept fits to our common use case where we do filtering
> > on entry uprobe and based on the result we decide to run the return
> > uprobe (or not).
> >
> > It's also convenient to share the data between session callbacks.
> >
> > The control of 'ret_handler' callback execution is done via return
> > value of the 'handler' callback. If it's 0 we install and execute
> > return uprobe, if it's 1 we do not.
> >
> > Signed-off-by: Jiri Olsa 
> > ---
> >  include/linux/uprobes.h |  16 -
> >  kernel/events/uprobes.c | 129 +---
> >  kernel/trace/bpf_trace.c|   6 +-
> >  kernel/trace/trace_uprobe.c |  12 ++--
> >  4 files changed, 144 insertions(+), 19 deletions(-)
> >
> > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > index f46e0ca0169c..903a860a8d01 100644
> > --- a/include/linux/uprobes.h
> > +++ b/include/linux/uprobes.h
> > @@ -34,15 +34,18 @@ enum uprobe_filter_ctx {
> >  };
> >
> >  struct uprobe_consumer {
> > - int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
> > + int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, 
> > __u64 *data);
> >   int (*ret_handler)(struct uprobe_consumer *self,
> >   unsigned long func,
> > - struct pt_regs *regs);
> > + struct pt_regs *regs, __u64 *data);
> >   bool (*filter)(struct uprobe_consumer *self,
> >   enum uprobe_filter_ctx ctx,
> >   struct mm_struct *mm);
> >
> >   struct uprobe_consumer *next;
> > +
> > +   bool            session;        /* marks uprobe session 
> > consumer */
> > +   unsigned int    session_id;     /* set when uprobe_consumer 
> > is registered */
>
> Hmm, why this has both session and session_id?

session is caller's request to establish session semantics. Jiri, I
think it's better to move it higher next to
handler/ret_handler/filter, that's the part of uprobe_consumer struct
which has read-only caller-provided data (I'm adding offset and
ref_ctr_offset there as well).

> I also think we can use the address of uprobe_consumer itself as a unique id.

+1

>
> Also, if we can set session enabled by default, and skip ret_handler by 
> handler's
> return value, it is more simpler. (If handler returns a specific value, skip 
> ret_handler)

you mean derive if it's a session or not by both handler and
ret_handler being set? I guess this works fine for BPF side, because
there we never had them both set. If this doesn't regress others, I
think it's OK. We just need to make sure we don't unnecessarily
allocate session state for consumers that don't set both handler and
ret_handler. That would be a waste.
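
To illustrate what I mean, a minimal sketch (the helper name is made up,
not from the patch): session semantics and session state allocation
would only kick in when a consumer provides both callbacks:

  static bool consumer_is_session(struct uprobe_consumer *uc)
  {
          return uc->handler && uc->ret_handler;
  }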

>
> >  };
> >
> >  #ifdef CONFIG_UPROBES
> > @@ -80,6 +83,12 @@ struct uprobe_task {
> >     unsigned int    depth;
> >  };
> >
> > +struct session_consumer {
> > + __u64   cookie;
>
> And this cookie looks not scalable. If we can pass a data to handler, I would 
> like to
> reuse it to pass the target function parameters to ret_handler as 
> kretprobe/fprobe does.
>
> int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, 
> void *data);
>
> uprobes can collect its uc's required sizes and allocate the memory (shadow 
> stack frame)
> at handler_chain().

The goal here is to keep this simple and fast. I'd prefer to keep it
small and fixed size, if possible. I'm thinking about caching and
reusing return_instance as one of the future optimizations, so if we
can keep this more or less fixed (assuming there is typically not more
than 1 or 2 consumers per uprobe, which seems realistic), this will
provide a way to avoid excessive memory allocations.

>
> > +   unsigned int    id;
> > + int rc;
> > +};
> > +
> >  struct return_instance {
> >   struct uprobe   *uprobe;
> >   unsigned long   func;
> > @@ -88,6 +97,9 @@ struct return_instance {
> >   boolchained;/* true, if 

Re: [PATCH v2] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 6:11 PM Josh Poimboeuf  wrote:
>
> On Tue, Jul 02, 2024 at 05:06:14PM -0700, Andrii Nakryiko wrote:
> > > > Should it also check for ENDBR64?
> > > >
> >
> > Sure, I can add a check for endbr64 as well. endbr64 probably can be
> > used not just at function entry, is that right? So it might be another
> > case of false positive (which I think is ok, see below).
>
> Yeah, at least theoretically they could happen in the middle of a
> function for implementing C switch jump tables.
>
> > > > When compiled with -fcf-protection=branch, the first instruction of the
> > > > function will almost always be ENDBR64.  I'm not sure about other
> > > > distros, but at least Fedora compiles its binaries like that.
> > >
> > > BTW, there are some cases (including leaf functions and some stack
> > > alignment sequences) where a "push %rbp" can happen inside a function.
> > > Then it would presumably add a bogus trace entry.  Are such false
> > > positives ok?
> >
> > I think such cases should be rare. People mostly seem to trace user
> > function entry/exit, rarely if ever they trace something within the
> > function, except for USDT cases, where it will be a nop instruction
> > that they trace.
> >
> > In general, even with false positives, I think it's overwhelmingly
> > better to get correct entry stack trace 99.9% of the time, and in the
> > rest 0.01% cases it's fine having one extra bogus entry (but the rest
> > should still be correct), which should be easy for humans to recognize
> > and filter out, if necessary.
>
> Agreed, this is a definite improvement overall.

Cool, I'll incorporate that into v3 and send it soon.

>
> BTW, soon there will be support for sframes instead of frame pointers,
> at which point these checks should only be done for the frame pointer
> case.

Nice, this is one of the reasons I've been thinking about asynchronous
stack trace capture in BPF (see [0] from recent LSF/MM).

Few questions, while we are at it. Does it mean that
perf_callchain_user() will support working from sleepable context and
will wait for data to be paged in? Is anyone already working on this?
Any pointers?

 [0] 
https://docs.google.com/presentation/d/1k10-HtK7pP5CMMa86dDCdLW55fHOut4co3Zs5akk0t4

>
> --
> Josh



[PATCH v3] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-02 Thread Andrii Nakryiko
When tracing user functions with uprobe functionality, it's common to
install the probe (e.g., a BPF program) at the first instruction of the
function. This is often going to be `push %rbp` instruction in function
preamble, which means that within that function frame pointer hasn't
been established yet. This leads to consistently missing an actual
caller of the traced function, because perf_callchain_user() only
records current IP (capturing traced function) and then following frame
pointer chain (which would be caller's frame, containing the address of
caller's caller).

So when we have target_1 -> target_2 -> target_3 call chain and we are
tracing an entry to target_3, captured stack trace will report
target_1 -> target_3 call chain, which is wrong and confusing.

This patch proposes a x86-64-specific heuristic to detect `push %rbp`
(`push %ebp` on 32-bit architecture) instruction being traced. Given
entire kernel implementation of user space stack trace capturing works
under assumption that user space code was compiled with frame pointer
register (%rbp/%ebp) preservation, it seems pretty reasonable to use
this instruction as a strong indicator that this is the entry to the
function. In that case, return address is still pointed to by %rsp/%esp,
so we fetch it and add to stack trace before proceeding to unwind the
rest using frame pointer-based logic.

We also check for `endbr64` (for 64-bit modes) as another common pattern
for function entry, as suggested by Josh Poimboeuf. Even if we get this
wrong sometimes for uprobes attached not at the function entry, it's OK
because stack trace will still be overall meaningful, just with one
extra bogus entry. If we don't detect this, we end up with a guaranteed
missing caller entry in the stack trace, which is worse overall.

Signed-off-by: Andrii Nakryiko 
---
v2->v3:
  - added endbr64 detection and extracted heuristics into a function (Josh);
v1->v2:
  - use native unsigned long for ret_addr (Peter);
  - add same logic for compat logic in perf_callchain_user32 (Peter).

 arch/x86/events/core.c  | 56 +
 include/linux/uprobes.h |  2 ++
 kernel/events/uprobes.c |  2 ++
 3 files changed, 60 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5b0dd07b1ef1..2174a9d2173e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2813,6 +2813,31 @@ static unsigned long get_segment_base(unsigned int 
segment)
return get_desc_base(desc);
 }
 
+#ifdef CONFIG_UPROBES
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+static bool is_uprobe_at_func_entry(struct pt_regs *regs, struct arch_uprobe 
*auprobe)
+{
+   if (!auprobe)
+   return false;
+   /* push %rbp/%ebp */
+   if (auprobe->insn[0] == 0x55)
+   return true;
+   /* endbr64 (64-bit only) */
+   if (user_64bit_mode(regs) && *(u32 *)auprobe->insn == 0xfa1e0ff3)
+   return true;
+   return false;
+}
+#endif
+
 #ifdef CONFIG_IA32_EMULATION
 
 #include 
@@ -2833,6 +2858,18 @@ perf_callchain_user32(struct pt_regs *regs, struct 
perf_callchain_entry_ctx *ent
 
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
+
+#ifdef CONFIG_UPROBES
+   /* see perf_callchain_user() below for why we do this */
+   if (current->utask) {
+   u32 ret_addr;
+
+   if (is_uprobe_at_func_entry(regs, current->utask->auprobe) &&
+   !__get_user(ret_addr, (const u32 __user *)regs->sp))
+   perf_callchain_store(entry, ret_addr);
+   }
+#endif
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
@@ -2884,6 +2921,25 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
*entry, struct pt_regs *regs
return;
 
pagefault_disable();
+
+#ifdef CONFIG_UPROBES
+   /*
+* If we are called from uprobe handler, and we are indeed at the very
+* entry to user function (which is normally a `push %rbp` instruction,
+* under assumption of application being compiled with frame pointers),
+* we should read return address from *regs->sp before proceeding
+* to follow frame pointers, otherwise we'll skip immediate caller
+* as %rbp is not yet setup.
+*/
+   if (current->utask) {
+   unsigned long ret_addr;
+
+   if (is_uprobe_at_func_entry(regs, cur

Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 12:19 PM Peter Zijlstra  wrote:
>
> On Tue, Jul 02, 2024 at 10:54:51AM -0700, Andrii Nakryiko wrote:
>
> > > @@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct uprobe 
> > > *uprobe)
> > > return uprobe;
> > >  }
> > >

[...]

> > > @@ -668,12 +677,25 @@ static struct uprobe *__find_uprobe(struct inode 
> > > *inode, loff_t offset)
> > >  static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> > >  {
> > > struct uprobe *uprobe;
> > > +   unsigned seq;
> > >
> > > -   read_lock(&uprobes_treelock);
> > > -   uprobe = __find_uprobe(inode, offset);
> > > -   read_unlock(&uprobes_treelock);
> > > +   guard(rcu)();
> > >
> > > -   return uprobe;
> > > +   do {
> > > +   seq = read_seqcount_begin(&uprobes_seqcount);
> > > +   uprobe = __find_uprobe(inode, offset);
> > > +   if (uprobe) {
> > > +   /*
> > > +* Lockless RB-tree lookups are prone to 
> > > false-negatives.
> > > +* If they find something, it's good. If they do 
> > > not find,
> > > +* it needs to be validated.
> > > +*/
> > > +   return uprobe;
> > > +   }
> > > +   } while (read_seqcount_retry(&uprobes_seqcount, seq));
> > > +
> > > +   /* Really didn't find anything. */
> > > +   return NULL;
> > >  }
> >
> > Honest question here, as I don't understand the tradeoffs well enough.
> > Is there a lot of benefit to switching to seqcount lock vs using
> > percpu RW semaphore (previously recommended by Ingo). The latter is a
> > nice drop-in replacement and seems to be very fast and scale well.
>
> As you noted, that percpu-rwsem write side is quite insane. And you're
> creating this batch complexity to mitigate that.


Note that the batch API is needed regardless of whether we use the percpu
RW semaphore or not. As I mentioned, once uprobes_treelock is mitigated
one way or the
other, the next one is uprobe->register_rwsem. For scalability, we
need to get rid of it and preferably not add any locking at all. So
tentatively I'd like to have lockless RCU-protected iteration over
uprobe->consumers list and call consumer->handler(). This means that
on uprobes_unregister we'd need synchronize_rcu (for whatever RCU
flavor we end up using), to ensure that we don't free uprobe_consumer
memory from under handle_swbp() while it is actually triggering
consumers.

So, without batched unregistration we'll be back to the same problem
I'm solving here: doing synchronize_rcu() for each attached uprobe one
by one is prohibitively slow. We went through this exercise with
ftrace/kprobes already and fixed it with batched APIs. Doing that for
uprobes seems unavoidable as well.
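
To spell out the amortization I have in mind, here is a rough sketch
(not an actual patch; __uprobe_unregister_one() is a placeholder, and
synchronize_rcu_tasks_trace() stands in for whichever RCU flavor's
synchronize call we end up using):

  void uprobe_unregister_batch(struct inode *inode, struct uprobe_consumer **ucs, int cnt)
  {
          int i;

          /* unlink every consumer from its uprobe; no grace period yet */
          for (i = 0; i < cnt; i++)
                  __uprobe_unregister_one(inode, ucs[i]);

          /*
           * One grace period for the whole batch instead of one per
           * consumer: after this, handle_swbp() cannot still be running
           * any of the unlinked handlers, so callers may free them.
           */
          synchronize_rcu_tasks_trace();
  }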

>
> The patches you propose are quite complex, this alternative not so much.

I agree that this custom refcounting is not trivial, but at least it's
pretty well contained within two low-level helpers which are all used
within this single .c file.

On the other hand, it actually gives us a) speed and better
scalability (I showed comparisons with refcount_inc_not_zero approach
earlier, I believe) and b) it actually simplifies logic during
registration (which is even more important aspect with batched API),
where we don't need to handle uprobe suddenly going away after we
already looked it up.

I believe overall it's an improvement worth doing.

>
> > Right now we are bottlenecked on uprobe->register_rwsem (not
> > uprobes_treelock anymore), which is currently limiting the scalability
> > of uprobes and I'm going to work on that next once I'm done with this
> > series.
>
> Right, but it looks fairly simple to replace that rwsem with a mutex and
> srcu.

srcu vs RCU Tasks Trace aside (which Paul addressed), see above about
the need for batched API and synchronize_rcu().



Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-02 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 4:56 PM Paul E. McKenney  wrote:
>
> On Tue, Jul 02, 2024 at 09:18:57PM +0200, Peter Zijlstra wrote:
> > On Tue, Jul 02, 2024 at 10:54:51AM -0700, Andrii Nakryiko wrote:
> >
> > > > @@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct uprobe 
> > > > *uprobe)
> > > > return uprobe;
> > > >  }
> > > >
> > > > +static void uprobe_free_rcu(struct rcu_head *rcu)
> > > > +{
> > > > +   struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> > > > +   kfree(uprobe);
> > > > +}
> > > > +
> > > >  static void put_uprobe(struct uprobe *uprobe)
> > > >  {
> > > > if (refcount_dec_and_test(&uprobe->ref)) {
> > > > @@ -604,7 +612,8 @@ static void put_uprobe(struct uprobe *uprobe)
> > >
> > > right above this we have roughly this:
> > >
> > > percpu_down_write(&uprobes_treelock);
> > >
> > > /* refcount check */
> > > rb_erase(&uprobe->rb_node, &uprobes_tree);
> > >
> > > percpu_up_write(&uprobes_treelock);
> > >
> > >
> > > This writer lock is necessary for modification of the RB tree. And I
> > > was under impression that I shouldn't be doing
> > > percpu_(down|up)_write() inside the normal
> > > rcu_read_lock()/rcu_read_unlock() region (percpu_down_write has
> > > might_sleep() in it). But maybe I'm wrong, hopefully Paul can help to
> > > clarify.
> >
> > preemptible RCU or SRCU would work.
>
> I agree that SRCU would work from a functional viewpoint.  No so for
> preemptible RCU, which permits preemption (and on -rt, blocking for
> spinlocks), it does not permit full-up blocking, and for good reason.
>
> > > But actually what's wrong with RCU Tasks Trace flavor?
> >
> > Paul, isn't this the RCU flavour you created to deal with
> > !rcu_is_watching()? The flavour that never should have been created in
> > favour of just cleaning up the mess instead of making more.
>
> My guess is that you are instead thinking of RCU Tasks Rude, which can
> be eliminated once all architectures get their entry/exit/deep-idle
> functions either inlined or marked noinstr.
>
> > > I will
> > > ultimately use it anyway to avoid uprobe taking unnecessary refcount
> > > and to protect uprobe->consumers iteration and uc->handler() calls,
> > > which could be sleepable, so would need rcu_read_lock_trace().
> >
> > I don't think you need trace-rcu for that. SRCU would do nicely I think.
>
> From a functional viewpoint, agreed.
>
> However, in the past, the memory-barrier and array-indexing overhead
> of SRCU has made it a no-go for lightweight probes into fastpath code.
> And these cases were what motivated RCU Tasks Trace (as opposed to RCU
> Tasks Rude).

Yep, and this is a similar case here. I've actually implemented
SRCU-based protection and benchmarked it (all other things being the
same). I see 5% slowdown for the fastest uprobe kind (entry uprobe on
nop) for the single-threaded use case. We go down from 3.15 million
triggerings/s to slightly below 3 million/s. With more threads the
difference increases a bit, though numbers vary a bit from run to run,
so I don't want to put out the exact number. But I see that for
SRCU-based implementation total aggregated peak achievable throughput
is about 3.5-3.6 mln/s vs this implementation reaching 4-4.1 mln/s.
Again, some of that could be variability, but I did run multiple
rounds and that's the trend I'm seeing.

>
> The other rule for RCU Tasks Trace is that although readers are permitted
> to block, this blocking can be for no longer than a major page fault.
> If you need longer-term blocking, then you should instead use SRCU.
>

And this is the case here. Right now rcu_read_lock_trace() is
protecting uprobes_treelock, which is only taken for the duration of
RB tree lookup/insert/delete. In my subsequent changes to eliminate
register_rwsem we might be executing uprobe_consumer under this RCU
lock, but those also should be only sleeping for page faults.

On the other hand, hot path (reader side) is quite hot with
millions/second executions and should add as little overhead as
possible (which is why I'm seeing SRCU-based implementation being
slower, as I mentioned above).

> Thanx, Paul
>
> > > > mutex_lock(&delayed_uprobe_lock);
> > > > delayed_uprobe_remove(uprobe, NULL);
> > > > m

Re: [PATCHv2 bpf-next 1/9] uprobe: Add support for session consumer

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 1:10 AM Peter Zijlstra  wrote:
>
> On Tue, Jul 02, 2024 at 01:51:28PM -0700, Andrii Nakryiko wrote:
> > > +static size_t ri_size(int sessions_cnt)
> > > +{
> > > +   struct return_instance *ri __maybe_unused;
> > > +
> > > +   return sizeof(*ri) + sessions_cnt * sizeof(ri->sessions[0]);
> >
> > just use struct_size()?
>
> Yeah, lets not. This is readable, struct_size() is not.

This hack with __maybe_unused is more readable than the standard
struct_size() helper that was added specifically for cases like this,
really?

I wonder if Kees agrees and whether there are any downsides to using
struct_size()

struct_size(struct return_instance, sessions, sessions_cnt) seems
readable enough to me, in any case.
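
For reference, the helper-based version would then be just this (a
sketch; assumes 'sessions' is the trailing flexible array of struct
return_instance and that struct_size_t() from <linux/overflow.h> is
acceptable here):

  static size_t ri_size(int sessions_cnt)
  {
          return struct_size_t(struct return_instance, sessions, sessions_cnt);
  }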



Re: [PATCH v2] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-03 Thread Andrii Nakryiko
On Tue, Jul 2, 2024 at 11:11 PM Josh Poimboeuf  wrote:
>
> On Tue, Jul 02, 2024 at 08:35:08PM -0700, Andrii Nakryiko wrote:
> > On Tue, Jul 2, 2024 at 6:11 PM Josh Poimboeuf  wrote:
> > > On Tue, Jul 02, 2024 at 05:06:14PM -0700, Andrii Nakryiko wrote:
> > > > In general, even with false positives, I think it's overwhelmingly
> > > > better to get a correct entry stack trace 99.9% of the time, and in the
> > > > remaining 0.1% of cases it's fine having one extra bogus entry (but the rest
> > > > should still be correct), which should be easy for humans to recognize
> > > > and filter out, if necessary.
> > >
> > > Agreed, this is a definite improvement overall.
> >
> > Cool, I'll incorporate that into v3 and send it soon.
> >

BTW, if you have a chance, please do take a look at v3 and leave your
ack, if you are ok with it. Thanks!

> > >
> > > BTW, soon there will be support for sframes instead of frame pointers,
> > > at which point these checks should only be done for the frame pointer
> > > case.
> >
> > Nice, this is one of the reasons I've been thinking about asynchronous
> > stack trace capture in BPF (see [0] from recent LSF/MM).
> >  [0] 
> > https://docs.google.com/presentation/d/1k10-HtK7pP5CMMa86dDCdLW55fHOut4co3Zs5akk0t4
>
> I don't seem to have permission to open it.
>

Argh, sorry, it's under my corporate account which doesn't allow
others to view it. Try this, I "published" it, let me know if that
still doesn't work:

  [0] 
https://docs.google.com/presentation/d/e/2PACX-1vRgL3UPbkrznwtNPKn-sSjvan7tFeMqOrIyZAFSSEPYiWG20JGSP80jBmZqGwqMuBGVmv9vyLU4KRTx/pub

> > Few questions, while we are at it. Does it mean that
> > perf_callchain_user() will support working from sleepable context and
> > will wait for data to be paged in? Is anyone already working on this?
> > Any pointers?
>
> I had a prototype here:
>
>   https://lkml.kernel.org/lkml/cover.1699487758.git.jpoim...@kernel.org
>
> Hopefully I can get started on v2 soon.

Ok, so you are going to work on this. Please cc me on future revisions
then. Thanks!

>
> --
> Josh



Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 1:07 AM Peter Zijlstra  wrote:
>
> On Tue, Jul 02, 2024 at 09:47:41PM -0700, Andrii Nakryiko wrote:
>
> > > As you noted, that percpu-rwsem write side is quite insane. And you're
> > > creating this batch complexity to mitigate that.
> >
> >
> > Note that batch API is needed regardless of percpu RW semaphore or
> > not. As I mentioned, once uprobes_treelock is mitigated one way or the
> > other, the next one is uprobe->register_rwsem. For scalability, we
> > need to get rid of it and preferably not add any locking at all. So
> > tentatively I'd like to have lockless RCU-protected iteration over
> > uprobe->consumers list and call consumer->handler(). This means that
> > on uprobes_unregister we'd need synchronize_rcu (for whatever RCU
> > flavor we end up using), to ensure that we don't free uprobe_consumer
> > memory from under handle_swbp() while it is actually triggering
> > consumers.
> >
> > So, without batched unregistration we'll be back to the same problem
> > I'm solving here: doing synchronize_rcu() for each attached uprobe one
> > by one is prohibitively slow. We went through this exercise with
> > ftrace/kprobes already and fixed it with batched APIs. Doing that for
> > uprobes seems unavoidable as well.
>
> I'm not immediately seeing how you need that terrible refcount stuff for

Which part is terrible, please be more specific. I can switch to
refcount_inc_not_zero() and leave performance improvement on the
table, but why is that a good idea?

> the batching though. If all you need is group a few unregisters together
> in order to share a sync_rcu() that seems way overkill.
>
> You seem to have muddled the order of things, which makes the actual
> reason for doing things utterly unclear.

See -EAGAIN handling in uprobe_register() code in the current upstream
kernel. We manage to allocate and insert (or update existing) uprobe
in uprobes_tree. And then when we try to register we can post factum
detect that uprobe was removed from RB tree from under us. And we have
to go on a retry, allocating/inserting/updating it again.

This is quite problematic for the batched API, in which I split the whole
attachment into a few independent phases:

  - preallocate uprobe instances (for all consumers/uprobes)
  - insert them or reuse pre-existing ones (again, for all consumers
in one batch, protected by single writer lock on uprobes_treelock);
  - then register/apply for each VMA (you get it, for all consumers in one go).

Having this retry for some of uprobes because of this race is hugely
problematic, so I wanted to make it cleaner and simpler: once you
manage to insert/reuse uprobe, it's not going away from under me.
Which is why the change to refcounting schema.

And I think it's a major improvement. We can argue about
refcount_inc_not_zero vs this custom refcounting schema, but I think
the change should be made.

Now, imagine I also did all the seqcount and RCU stuff across the entire
uprobe functionality. Wouldn't that be a bit mind-bending to wrap your
head around?



Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-03 Thread Andrii Nakryiko
On Mon, Jul 1, 2024 at 3:39 PM Andrii Nakryiko  wrote:
>
> This patch set, ultimately, switches global uprobes_treelock from RW spinlock
> to per-CPU RW semaphore, which has better performance and scales better under
> contention and multiple parallel threads triggering lots of uprobes.
>
> To make this work well with attaching multiple uprobes (through BPF
> multi-uprobe), we need to add batched versions of uprobe register/unregister
> APIs. This is what most of the patch set is actually doing. The actual switch
> to per-CPU RW semaphore is trivial after that and is done in the very last
> patch #12. See commit message with some comparison numbers.
>

Peter,

I think I've addressed all the questions so far, but I wanted to take
a moment and bring all the discussions into a single place, summarize
what I think are the main points of contention and hopefully make some
progress, or at least get us to a bit more constructive discussion
where *both sides* provide arguments. Right now there is a lot of "you
are doing X, but why don't you just do Y" with no argument for a) why
X is bad/wrong/inferior and b) why Y is better (and not just
equivalent or, even worse, inferior).

I trust you have the best intentions in mind for this piece of kernel
infrastructure, so do I, so let's try to find a path forward.

1. Strategically, uprobes/uretprobes have to be improved. Customers do
complain more and more that "uprobes are slow", justifiably so. Both
single-threaded performance matters, but also, critically, uprobes
scalability. I.e., if the kernel can handle N uprobe per second on a
single uncontended CPU, then triggering uprobes across M CPUs should,
ideally and roughly, give us about N * M total throughput.

This doesn't seem controversial, but I wanted to make it clear that
this is the end goal of my work. And no, this patch set alone doesn't,
yet, get us there. But it's a necessary step, IMO. Jiri Olsa took
single-threaded performance and is improving it with sys_uretprobe and
soon sys_uprobe, I'm looking into scalability and other smaller
single-threaded wins, where possible.

2. More tactically, RCU protection seems like the best way forward. We
got hung up on SRCU vs RCU Tasks Trace. Thanks to Paul, we also
clarified that RCU Tasks Trace has nothing to do with Tasks Rude
flavor (whatever that is, I have no idea).

Now, RCU Tasks Trace was specifically designed for least-overhead
hotpath (reader side) performance, at the expense of slowing down the much
rarer writers. My microbenchmarking does show at least a 5% difference.
Both flavors can handle sleepable uprobes waiting for page faults.
Tasks Trace flavor is already used for tracing in the BPF realm,
including for sleepable uprobes and works well. It's not going away.

Now, you keep pushing for SRCU instead of RCU Tasks Trace, but I
haven't seen a single argument why. Please provide that, or let's
stick to RCU Tasks Trace, because uprobe's use case is an ideal case
of what Tasks Trace flavor was designed for.

3. Regardless of RCU flavor, due to RCU protection, we have to add
batched register/unregister APIs, so we can amortize sync_rcu cost
during deregistration. Can we please agree on that as well? This is
the main goal of this patch set and I'd like to land it before working
further on changing and improving the rest of the locking schema.

I won't be happy about it, but just to move things forward, I can drop
a) custom refcounting and/or b) percpu RW semaphore. Both are
beneficial but not essential for the batched APIs to work. But if you force
me to do that, please state clearly your reasons/arguments. No one had
yet pointed out why refcounting is broken and why percpu RW semaphore
is bad. On the contrary, Ingo Molnar did suggest percpu RW semaphore
in the first place (see [0]), but we postponed it due to the lack of
batched APIs, and promised to do this work. Here I am, doing the
promised work. Not purely because of percpu RW semaphore, but
benefiting from it just as well.

  [0] https://lore.kernel.org/linux-trace-kernel/zf+d9twfyidos...@gmail.com/

4. Another tactical thing, but an important one. Refcounting schema
for uprobes. I've replied already, but I think refcounting is
unavoidable for uretprobes, and current refcounting schema is
problematic for batched APIs due to the race between finding an uprobe and
the remaining possibility that we'd need to undo all of that and retry
again.

I think the main thing is to agree to change refcounting to avoid this
race, allowing for simpler batched registration. Hopefully we can
agree on that.

But also, refcount_inc_not_zero() which is another limiting factor for
scalability (see above about the end goal of scalability) vs
atomic64_add()-based epoch+refcount approach I took, which is
noticeably better on x86-64, and I don't think hurts any other
architecture, to say the least. I think the latte

Re: [PATCHv2 bpf-next 1/9] uprobe: Add support for session consumer

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 8:31 AM Jiri Olsa  wrote:
>
> On Tue, Jul 02, 2024 at 01:52:38PM -0700, Andrii Nakryiko wrote:
> > On Tue, Jul 2, 2024 at 9:11 AM Jiri Olsa  wrote:
> > >
> > > On Tue, Jul 02, 2024 at 03:04:08PM +0200, Peter Zijlstra wrote:
> > > > On Mon, Jul 01, 2024 at 06:41:07PM +0200, Jiri Olsa wrote:
> > > >
> > > > > +static void
> > > > > +uprobe_consumer_account(struct uprobe *uprobe, struct 
> > > > > uprobe_consumer *uc)
> > > > > +{
> > > > > +   static unsigned int session_id;
> > > > > +
> > > > > +   if (uc->session) {
> > > > > +   uprobe->sessions_cnt++;
> > > > > +   uc->session_id = ++session_id ?: ++session_id;
> > > > > +   }
> > > > > +}
> > > >
> > > > The way I understand this code, you create a consumer every time you do
> > > > uprobe_register() and unregister makes it go away.
> > > >
> > > > Now, register one, then 4g-1 times register+unregister, then register
> > > > again.
> > > >
> > > > The above seems to then result in two consumers with the same
> > > > session_id, which leads to trouble.
> > > >
> > > > Hmm?
> > >
> > > ugh true.. will make it u64 :)
> > >
> > > I think we could store uprobe_consumer pointer+ref in session_consumer,
> > > and that would make the unregister path more interesting.. will check
> >
> > More interesting how? It's actually a great idea, uprobe_consumer
>
> nah, got confused ;-)
>
> > pointer itself is a unique ID and 64-bit. We can still use lowest bit
> > for RC (see my other reply).
>
> I used pointers in the previous version, but then I thought what if the
> consumer gets free-ed and new one created (with same address.. maybe not
> likely but possible, right?) before the return probe is hit

I think no matter what we do, uprobe_unregister() API has to guarantee
that when it returns consumer won't be hit (i.e., we removed consumer
from uprobe->consumers list, waited for RCU grace period(s), etc). So
I don't think this should be a problem. And that's one of the reasons
for the need for batched unregister, because we'll have to do sync_rcu
call there for this.

>
> jirka



Re: [PATCHv2 bpf-next 1/9] uprobe: Add support for session consumer

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 9:09 AM Jiri Olsa  wrote:
>
> On Tue, Jul 02, 2024 at 05:13:38PM -0700, Andrii Nakryiko wrote:
> > On Tue, Jul 2, 2024 at 4:55 PM Masami Hiramatsu  wrote:
> > >
> > > Hi Jiri,
> > >
> > > On Mon,  1 Jul 2024 18:41:07 +0200
> > > Jiri Olsa  wrote:
> > >
> > > > Adding support for uprobe consumer to be defined as session and have
> > > > new behaviour for consumer's 'handler' and 'ret_handler' callbacks.
> > > >
> > > > The session means that 'handler' and 'ret_handler' callbacks are
> > > > connected in a way that allows to:
> > > >
> > > >   - control execution of 'ret_handler' from 'handler' callback
> > > >   - share data between 'handler' and 'ret_handler' callbacks
> > > >
> > > > The session is enabled by setting new 'session' bool field to true
> > > > in uprobe_consumer object.
> > > >
> > > > We keep count of session consumers for uprobe and allocate 
> > > > session_consumer
> > > > object for each in return_instance object. This allows us to store
> > > > return values of 'handler' callbacks and data pointers of shared
> > > > data between both handlers.
> > > >
> > > > The session concept fits to our common use case where we do filtering
> > > > on entry uprobe and based on the result we decide to run the return
> > > > uprobe (or not).
> > > >
> > > > It's also convenient to share the data between session callbacks.
> > > >
> > > > The control of 'ret_handler' callback execution is done via return
> > > > value of the 'handler' callback. If it's 0 we install and execute
> > > > return uprobe, if it's 1 we do not.
> > > >
> > > > Signed-off-by: Jiri Olsa 
> > > > ---
> > > >  include/linux/uprobes.h |  16 -
> > > >  kernel/events/uprobes.c | 129 +---
> > > >  kernel/trace/bpf_trace.c|   6 +-
> > > >  kernel/trace/trace_uprobe.c |  12 ++--
> > > >  4 files changed, 144 insertions(+), 19 deletions(-)
> > > >
> > > > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > > > index f46e0ca0169c..903a860a8d01 100644
> > > > --- a/include/linux/uprobes.h
> > > > +++ b/include/linux/uprobes.h
> > > > @@ -34,15 +34,18 @@ enum uprobe_filter_ctx {
> > > >  };
> > > >
> > > >  struct uprobe_consumer {
> > > > - int (*handler)(struct uprobe_consumer *self, struct pt_regs 
> > > > *regs);
> > > > + int (*handler)(struct uprobe_consumer *self, struct pt_regs 
> > > > *regs, __u64 *data);
> > > >   int (*ret_handler)(struct uprobe_consumer *self,
> > > >   unsigned long func,
> > > > - struct pt_regs *regs);
> > > > + struct pt_regs *regs, __u64 *data);
> > > >   bool (*filter)(struct uprobe_consumer *self,
> > > >   enum uprobe_filter_ctx ctx,
> > > >   struct mm_struct *mm);
> > > >
> > > >   struct uprobe_consumer *next;
> > > > +
> > > > + boolsession;/* marks uprobe session 
> > > > consumer */
> > > > + unsigned intsession_id; /* set when 
> > > > uprobe_consumer is registered */
> > >
> > > Hmm, why this has both session and session_id?
> >
> > session is caller's request to establish session semantics. Jiri, I
>
> and session_id is set when uprobe is registered and used when
> return uprobe is executed to find matching uprobe_consumer,
> plz check handle_uretprobe_chain/session_consumer_find
>
> > think it's better to move it higher next to
> > handler/ret_handler/filter, that's the part of uprobe_consumer struct
> > which has read-only caller-provided data (I'm adding offset and
> > ref_ctr_offset there as well).
>
> ok, makes sense
>
> >
> > > I also think we can use the address of uprobe_consumer itself as a unique 
> > > id.
> >
> > +1
> >
> > >
> > > Also, if we can set session enabled by default, and skip ret_hand

Re: [PATCHv2 bpf-next 1/9] uprobe: Add support for session consumer

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 10:13 AM Jiri Olsa  wrote:
>
> On Tue, Jul 02, 2024 at 01:51:28PM -0700, Andrii Nakryiko wrote:
>
> SNIP
>
> > >  #ifdef CONFIG_UPROBES
> > > @@ -80,6 +83,12 @@ struct uprobe_task {
> > > unsigned intdepth;
> > >  };
> > >
> > > +struct session_consumer {
> > > +   __u64   cookie;
> > > +   unsigned intid;
> > > +   int rc;
> >
> > you'll be using u64 for ID, right? so this struct will be 24 bytes.
>
> yes
>
> > Maybe we can just use topmost bit of ID to store whether uretprobe
> > should run or not? It's trivial to mask out during ID comparisons
>
> actually.. I think we could store just consumers that need to be
> executed in return probe so there will be no need for 'rc' value

ah, nice idea. NULL would mean we have a session uprobe, but for this
particular run we "disabled" the uretprobe part of it. Great. And for
non-session uprobes we just won't have a session_consumer at all, right?
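I.e., on the uretprobe side something along these lines (a sketch; the
sessions[]/sessions_cnt fields and storing a consumer pointer instead of
id/rc are my made-up approximation of the idea):

    /* in handle_uretprobe_chain(), roughly: */
    int i;

    for (i = 0; i < ri->sessions_cnt; i++) {
            struct session_consumer *sc = &ri->sessions[i];

            /* NULL consumer: entry handler asked to skip the return probe this run */
            if (!sc->consumer)
                    continue;

            sc->consumer->ret_handler(sc->consumer, ri->func, regs, &sc->cookie);
    }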

[...]

> > > +static struct session_consumer *
> > > +session_consumer_next(struct return_instance *ri, struct 
> > > session_consumer *sc,
> > > + int session_id)
> > > +{
> > > +   struct session_consumer *next;
> > > +
> > > +   next = sc ? sc + 1 : &ri->sessions[0];
> > > +   next->id = session_id;
> >
> > it's kind of unexpected that "session_consumer_next" would actually
> > set an ID... Maybe drop int session_id as input argument and fill it
> > out outside of this function, this function being just a simple
> > iterator?
>
> yea, I was going back and forth on what to have in that function
> or not, to keep the change minimal, but makes sense, will move
>

great, thanks

> >
> > > +   return next;
> > > +}
> > > +

[...]

> >
> > > +   } else if (uc->ret_handler) {
> > > need_prep = true;
> > > +   }
> > >
> > > remove &= rc;
> > > }
> > >
> > > +   /* no removal if there's at least one session consumer */
> > > +   remove &= !uprobe->sessions_cnt;
> >
> > this is counter (not error, not pointer), let's stick to ` == 0`, please
> >
> > is this
> >
> > if (uprobe->sessions_cnt != 0)
> >remove = 0;
>
> yes ;-) will change
>

Thanks, I feel bad for being the only one to call this out, but I find
all these '!' constructs extremely unintuitive and hard to reason about
quickly. It's only pointers and error cases that are more or less
intuitive. Everything else, including !strcmp(...), is just mind-bending
and exhausting... Perhaps I'm just not enough of a kernel engineer :)

> jirka
>
> >
> > ? I can't tell (honestly), without spending ridiculous amounts of
> > mental resources (for the underlying simplicity of the condition).
>
> SNIP



Re: [PATCH v2] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 3:41 PM Josh Poimboeuf  wrote:
>
> On Wed, Jul 03, 2024 at 01:23:39PM -0700, Andrii Nakryiko wrote:
> > > >  [0] 
> > > > https://docs.google.com/presentation/d/1k10-HtK7pP5CMMa86dDCdLW55fHOut4co3Zs5akk0t4
> > >
> > > I don't seem to have permission to open it.
> > >
> >
> > Argh, sorry, it's under my corporate account which doesn't allow
> > others to view it. Try this, I "published" it, let me know if that
> > still doesn't work:
> >
> >   [0] 
> > https://docs.google.com/presentation/d/e/2PACX-1vRgL3UPbkrznwtNPKn-sSjvan7tFeMqOrIyZAFSSEPYiWG20JGSP80jBmZqGwqMuBGVmv9vyLU4KRTx/pub
>
> The new link doesn't work either :-)
>

Goodness, sorry about that. I just recreated it under my public
account and shared it with the world. This HAS to work:

  
https://docs.google.com/presentation/d/1eaOf9CVZlCOD6b7_UtZBYMfTyYIDZw9clyjzu-IIOIo

> > > > Few questions, while we are at it. Does it mean that
> > > > perf_callchain_user() will support working from sleepable context and
> > > > will wait for data to be paged in? Is anyone already working on this?
> > > > Any pointers?
> > >
> > > I had a prototype here:
> > >
> > >   https://lkml.kernel.org/lkml/cover.1699487758.git.jpoim...@kernel.org
> > >
> > > Hopefully I can get started on v2 soon.
> >
> > Ok, so you are going to work on this. Please cc me on future revisions
> > then. Thanks!
>
> Will do!
>
> --
> Josh



Re: [PATCH v3] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 3:39 PM Josh Poimboeuf  wrote:
>
> On Tue, Jul 02, 2024 at 09:02:03PM -0700, Andrii Nakryiko wrote:
> > @@ -2833,6 +2858,18 @@ perf_callchain_user32(struct pt_regs *regs, struct 
> > perf_callchain_entry_ctx *ent
> >
> >   fp = compat_ptr(ss_base + regs->bp);
> >   pagefault_disable();
> > +
> > +#ifdef CONFIG_UPROBES
> > + /* see perf_callchain_user() below for why we do this */
> > + if (current->utask) {
> > + u32 ret_addr;
> > +
> > + if (is_uprobe_at_func_entry(regs, current->utask->auprobe) &&
> > + !__get_user(ret_addr, (const u32 __user *)regs->sp))
>
> Shouldn't the regs->sp value be checked with __access_ok() before
> calling __get_user()?

Ah, it's the __get_user() vs get_user() quirk, right? Should I just use
get_user() here? It seems like the existing code is trying to avoid two
__access_ok() checks for the two fields of stack_frame, but here we
don't have that optimization opportunity anyways.
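IOW, one of these two variants (just a sketch, untested; either would
still be gated by the is_uprobe_at_func_entry() check):

    u32 ret_addr;

    /* option 1: keep __get_user(), but add the explicit range check you suggest */
    if (__access_ok((void __user *)regs->sp, sizeof(ret_addr)) &&
        !__get_user(ret_addr, (const u32 __user *)regs->sp))
            perf_callchain_store(entry, ret_addr);

    /* option 2: just use get_user(), which does the access_ok() check itself */
    if (!get_user(ret_addr, (const u32 __user *)regs->sp))
            perf_callchain_store(entry, ret_addr);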

>
> Also, instead of littering functions with ifdefs it would be better to
> abstract this out into a separate function which has an "always return
> false" version for !CONFIG_UPROBES.  Then the above could be simplified to
> something like:

Sure, can do.

>
> ...
> pagefault_disable();

But I'd leave pagefault_disable() outside of that function, because
caller has to do it either way.

>
> if (is_uprobe_at_func_entry(regs, current) &&
> __access_ok(regs->sp, 4) &&
> !__get_user(ret_addr, (const u32 __user *)regs->sp))
> perf_callchain_store(entry, ret_addr);
> ...
>
> Also it's good practice to wait at least several days before posting new
> versions to avoid spamming reviewers and to give them time to digest
> what you've already sent.

I'm not sure about "at least several days", tbh. I generally try not to
post more often than once a day, and even then only if I received some
meaningful review feedback (like in your case). I do wait a few days
for reviews before pinging the mailing list again, though.

Would I have gotten this feedback if I hadn't posted v3? Or would we
just be delaying the inevitable for a few more idle days? This particular
change (in its initial version, before yours and Peter's recent
comments) has been sitting under review since May 8th ([0], and was then
posted without changes on May 21st, [1]), so I'm not exactly rushing
things here.

Either way, I won't get to this until next week, so I won't spam you
too much anymore, sorry.

  [0] 
https://lore.kernel.org/linux-trace-kernel/20240508212605.4012172-4-and...@kernel.org/
  [1] 
https://lore.kernel.org/linux-trace-kernel/20240522013845.1631305-4-and...@kernel.org/

>
> --
> Josh



Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-08 Thread Andrii Nakryiko
On Thu, Jul 4, 2024 at 8:44 AM Paul E. McKenney  wrote:
>
> On Thu, Jul 04, 2024 at 11:15:59AM +0200, Peter Zijlstra wrote:
> > On Wed, Jul 03, 2024 at 02:33:06PM -0700, Andrii Nakryiko wrote:
> >
> > > 2. More tactically, RCU protection seems like the best way forward. We
> > > got hung up on SRCU vs RCU Tasks Trace. Thanks to Paul, we also
> > > clarified that RCU Tasks Trace has nothing to do with Tasks Rude
> > > flavor (whatever that is, I have no idea).
> > >
> > > Now, RCU Tasks Trace were specifically designed for least overhead
> > > hotpath (reader side) performance, at the expense of slowing down much
> > > rarer writers. My microbenchmarking does show at least 5% difference.
> > > Both flavors can handle sleepable uprobes waiting for page faults.
> > > Tasks Trace flavor is already used for tracing in the BPF realm,
> > > including for sleepable uprobes and works well. It's not going away.
> >
> > I need to look into this new RCU flavour and why it exists -- for
> > example, why can't SRCU be improved to gain the same benefits. This is
> > what we've always done, improve SRCU.
>
> Well, it is all software.  And I certainly pushed SRCU hard.  If I recall
> correctly, it took them a year to convince me that they needed something
> more than SRCU could reasonably be convinced to do.
>
> The big problem is that they need to be able to hook a simple BPF program
> (for example, count the number of calls with given argument values) on
> a fastpath function on a system running in production without causing
> the automation to decide that this system is too slow, thus whacking it
> over the head.  Any appreciable overhead is a no-go in this use case.
> It is not just that the srcu_read_lock() function's smp_mb() call would
> disqualify SRCU, its other added overhead would as well.  Plus this needs
> RCU Tasks Trace CPU stall warnings to catch abuse, and SRCU doesn't
> impose any limits on readers (how long to set the stall time?) and
> doesn't track tasks.
>
> > > Now, you keep pushing for SRCU instead of RCU Tasks Trace, but I
> > > haven't seen a single argument why. Please provide that, or let's
> > > stick to RCU Tasks Trace, because uprobe's use case is an ideal case
> > > of what Tasks Trace flavor was designed for.
> >
> > Because I actually know SRCU, and because it provides a local scope.
> > It isolates the unregister waiters from other random users. I'm not
> > going to use this funky new flavour until I truly understand it.
>
> It is only a few hundred lines of code on top of the infrastructure
> that also supports RCU Tasks and RCU Tasks Rude.  If you understand
> SRCU and preemptible RCU, there will be nothing exotic there, and it is
> simpler than Tree SRCU, to say nothing of preemptible RCU.  I would be
> more than happy to take you through it if you would like, but not before
> this coming Monday.
>
> > Also, we actually want two scopes here, there is no reason for the
> > consumer unreg to wait for the retprobe stuff.
>
> I don't know that the performance requirements for userspace retprobes are
> as severe as for function-call probes -- on that, I must defer to Andrii.

uretprobes are just as important (performance-wise and just in terms of
functionality), as they are often used simultaneously (e.g., to time
some user function, or to capture input args and decide whether to log
them based on the return value). uretprobes are inherently slower
(because they are an entry probe plus some extra bookkeeping and
overhead), but we should do the best we can to ensure they are as
performant as possible.


> To your two-scopes point, it is quite possible that SRCU could be used
> for userspace retprobes and RCU Tasks Trace for the others.  It certainly
> seems to me that SRCU would be better than explicit reference counting,
> but I could be missing something.  (Memory footprint, perhaps?  Though
> maybe a single srcu_struct could be shared among all userspace retprobes.
> Given the time-bounded reads, maybe stall warnings aren't needed,
> give or take things like interrupts, preemption, and vCPU preemption.
> Plus it is not like it would be hard to figure out which read-side code
> region was at fault when the synchronize_srcu() took too long.)
>
> Thanx, Paul
>
> > > 3. Regardless of RCU flavor, due to RCU protection, we have to add
> > > batched register/unregister APIs, so we can amortize sync_rcu cost
> > > during deregistration. Can we please agree on that as well? This is
> > > the main goal of this patch set and I'

Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

2024-07-08 Thread Andrii Nakryiko
On Thu, Jul 4, 2024 at 2:16 AM Peter Zijlstra  wrote:
>
> On Wed, Jul 03, 2024 at 02:33:06PM -0700, Andrii Nakryiko wrote:
>
> > 2. More tactically, RCU protection seems like the best way forward. We
> > got hung up on SRCU vs RCU Tasks Trace. Thanks to Paul, we also
> > clarified that RCU Tasks Trace has nothing to do with Tasks Rude
> > flavor (whatever that is, I have no idea).
> >
> > Now, RCU Tasks Trace were specifically designed for least overhead
> > hotpath (reader side) performance, at the expense of slowing down much
> > rarer writers. My microbenchmarking does show at least 5% difference.
> > Both flavors can handle sleepable uprobes waiting for page faults.
> > Tasks Trace flavor is already used for tracing in the BPF realm,
> > including for sleepable uprobes and works well. It's not going away.
>
> I need to look into this new RCU flavour and why it exists -- for
> example, why can't SRCU be improved to gain the same benefits. This is
> what we've always done, improve SRCU.

Yes, that makes sense, in principle. But if it takes too much time to
improve SRCU, I'd say it's reasonable to use the faster solution until
it can be unified (if at all, of course).

>
> > Now, you keep pushing for SRCU instead of RCU Tasks Trace, but I
> > haven't seen a single argument why. Please provide that, or let's
> > stick to RCU Tasks Trace, because uprobe's use case is an ideal case
> > of what Tasks Trace flavor was designed for.
>
> Because I actually know SRCU, and because it provides a local scope.
> It isolates the unregister waiters from other random users. I'm not
> going to use this funky new flavour until I truly understand it.
>
> Also, we actually want two scopes here, there is no reason for the
> consumer unreg to wait for the retprobe stuff.
>

Uprobe attachment/detachment (i.e., register/unregister) is a very rare
operation. Its performance doesn't really matter in the grand scheme of
things, in the sense that whether it takes 1, 10, or 200 milliseconds is
immaterial compared to uprobe/uretprobe triggering performance. The only
important thing is that it doesn't take multiple seconds or minutes (or
even hours, if we do synchronize_rcu unconditionally after each
unregister) to attach/detach 100s/1000s+ uprobes.

I'm just saying this is the wrong target to optimize for, as long as we
ensure it's reasonably performant in the face of multiple uprobes
registering/unregistering (so one common SRCU scope for
registration/unregistration is totally fine, IMO).


> > 3. Regardless of RCU flavor, due to RCU protection, we have to add
> > batched register/unregister APIs, so we can amortize sync_rcu cost
> > during deregistration. Can we please agree on that as well? This is
> > the main goal of this patch set and I'd like to land it before working
> > further on changing and improving the rest of the locking schema.
>
> See my patch here:
>
>   
> https://lkml.kernel.org/r/20240704084524.gc28...@noisy.programming.kicks-ass.net
>
> I don't think it needs to be more complicated than that.

Alright, I'll take a closer look this week and will run it through my
tests and benchmarks, thanks for working on this and sending it out!

>
> > I won't be happy about it, but just to move things forward, I can drop
> > a) custom refcounting and/or b) percpu RW semaphore. Both are
> > beneficial but not essential for batched APIs work. But if you force
> > me to do that, please state clearly your reasons/arguments.
>
> The reason I'm pushing RCU here is because AFAICT uprobes doesn't
> actually need the stronger serialisation that rwlock (any flavour)
> provide. It is a prime candidate for RCU, and I think you'll find plenty
> papers / articles (by both Paul and others) that show that RCU scales
> better.
>
> As a bonus, you avoid that horrific write side cost that per-cpu rwsem
> has.
>
> The reason I'm not keen on that refcount thing was initially because I
> did not understand the justification for it, but worse, once I did read
> your justification, your very own numbers convinced me that the refcount
> is fundamentally problematic, in any way shape or form.
>
> > No one had yet pointed out why refcounting is broken
>
> Your very own numbers point out that refcounting is a problem here.

Yes, I already agreed to avoid refcounting if possible. The question
above was why the refcounting I added was broken in and of itself. But
it's a moot point (at least for now); let me go look at your patches.

>
> > and why percpu RW semaphore is bad.
>
> Literature and history show us that RCU -- where possible -- is
> always better than

[PATCH v4] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-08 Thread Andrii Nakryiko
When tracing user functions with uprobe functionality, it's common to
install the probe (e.g., a BPF program) at the first instruction of the
function. This is often going to be the `push %rbp` instruction in the
function preamble, which means that within that function the frame
pointer hasn't been established yet. This leads to consistently missing
the actual caller of the traced function, because perf_callchain_user()
only records the current IP (capturing the traced function) and then
follows the frame pointer chain (which would be the caller's frame,
containing the address of the caller's caller).

So when we have a target_1 -> target_2 -> target_3 call chain and we are
tracing an entry to target_3, the captured stack trace will report a
target_1 -> target_3 call chain, which is wrong and confusing.

This patch proposes an x86-64-specific heuristic to detect that the
traced instruction is `push %rbp` (`push %ebp` on 32-bit). Given that
the entire kernel implementation of user-space stack trace capturing
works under the assumption that user-space code was compiled with frame
pointer register (%rbp/%ebp) preservation, it seems pretty reasonable to
use this instruction as a strong indicator that this is the entry to the
function. In that case, the return address is still pointed to by
%rsp/%esp, so we fetch it and add it to the stack trace before
proceeding to unwind the rest using the frame pointer-based logic.

We also check for `endbr64` (for 64-bit mode) as another common pattern
at function entry, as suggested by Josh Poimboeuf. Even if we get this
wrong sometimes for uprobes attached not at the function entry, it's OK
because the stack trace will still be overall meaningful, just with one
extra bogus entry. If we don't detect this, we are guaranteed to miss
the caller's entry in the stack trace, which is worse overall.

Signed-off-by: Andrii Nakryiko 
---
v3->v4:
  - use get_user() instead of __get_user(), given untrusted input (Josh);
  - reduced #ifdef-ery (Josh);
v2->v3:
  - added endbr64 detection and extracted heuristics into a function (Josh);
v1->v2:
  - use native unsigned long for ret_addr (Peter);
  - add same logic for compat logic in perf_callchain_user32 (Peter).

 arch/x86/events/core.c  | 62 +
 include/linux/uprobes.h |  2 ++
 kernel/events/uprobes.c |  2 ++
 3 files changed, 66 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5b0dd07b1ef1..c09603d769a2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "perf_event.h"
 
@@ -2813,6 +2814,46 @@ static unsigned long get_segment_base(unsigned int 
segment)
return get_desc_base(desc);
 }
 
+#ifdef CONFIG_UPROBES
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+   struct arch_uprobe *auprobe;
+
+   if (!current->utask)
+   return false;
+
+   auprobe = current->utask->auprobe;
+   if (!auprobe)
+   return false;
+
+   /* push %rbp/%ebp */
+   if (auprobe->insn[0] == 0x55)
+   return true;
+
+   /* endbr64 (64-bit only) */
+   if (user_64bit_mode(regs) && *(u32 *)auprobe->insn == 0xfa1e0ff3)
+   return true;
+
+   return false;
+}
+
+#else
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+   return false;
+}
+#endif /* CONFIG_UPROBES */
+
 #ifdef CONFIG_IA32_EMULATION
 
 #include 
@@ -2824,6 +2865,7 @@ perf_callchain_user32(struct pt_regs *regs, struct 
perf_callchain_entry_ctx *ent
unsigned long ss_base, cs_base;
struct stack_frame_ia32 frame;
const struct stack_frame_ia32 __user *fp;
+   u32 ret_addr;
 
if (user_64bit_mode(regs))
return 0;
@@ -2833,6 +2875,12 @@ perf_callchain_user32(struct pt_regs *regs, struct 
perf_callchain_entry_ctx *ent
 
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
+
+   /* see perf_callchain_user() below for why we do this */
+   if (is_uprobe_at_func_entry(regs) &&
+   !get_user(ret_addr, (const u32 __user *)regs->sp))
+   perf_callchain_store(entry, ret_addr);
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
@@ -2861,6 +2909,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
*entry, struct pt_regs *regs
 {
struct stack_frame frame;
   

Re: [PATCH v4] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-09 Thread Andrii Nakryiko
On Tue, Jul 9, 2024 at 3:11 AM Peter Zijlstra  wrote:
>
> On Mon, Jul 08, 2024 at 04:11:27PM -0700, Andrii Nakryiko wrote:
> > +#ifdef CONFIG_UPROBES
> > +/*
> > + * Heuristic-based check if uprobe is installed at the function entry.
> > + *
> > + * Under assumption of user code being compiled with frame pointers,
> > + * `push %rbp/%ebp` is a good indicator that we indeed are.
> > + *
> > + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
> > + * If we get this wrong, captured stack trace might have one extra bogus
> > + * entry, but the rest of stack trace will still be meaningful.
> > + */
> > +static bool is_uprobe_at_func_entry(struct pt_regs *regs)
> > +{
> > + struct arch_uprobe *auprobe;
> > +
> > + if (!current->utask)
> > + return false;
> > +
> > + auprobe = current->utask->auprobe;
> > + if (!auprobe)
> > + return false;
> > +
> > + /* push %rbp/%ebp */
> > + if (auprobe->insn[0] == 0x55)
> > + return true;
> > +
> > + /* endbr64 (64-bit only) */
> > + if (user_64bit_mode(regs) && *(u32 *)auprobe->insn == 0xfa1e0ff3)
> > + return true;
>
> I meant to reply to Josh suggesting this, but... how can this be? If you
> scribble the ENDBR with an INT3 things will #CP and we'll never get to
> the #BP.

Well, it seems like it works in practice, I just tried. Here's the
disassembly of the function:

19d0 :
19d0: f3 0f 1e fa   endbr64
19d4: 55pushq   %rbp
19d5: 48 89 e5  movq%rsp, %rbp
19d8: 48 83 ec 10   subq$0x10, %rsp
19dc: 48 8d 3d fe ed ff ff  leaq-0x1202(%rip), %rdi
 # 0x7e1 <__isoc99_scanf+0x7e1>
19e3: 48 8d 75 fc   leaq-0x4(%rbp), %rsi
19e7: b0 00 movb$0x0, %al
19e9: e8 f2 00 00 00callq   0x1ae0 <__isoc99_scanf+0x1ae0>
19ee: b8 01 00 00 00movl$0x1, %eax
19f3: 48 83 c4 10   addq$0x10, %rsp
19f7: 5dpopq%rbp
19f8: c3retq
19f9: 0f 1f 80 00 00 00 00  nopl(%rax)

And here's the state when uprobe is attached:

(gdb) disass/r urandlib_api_v1
Dump of assembler code for function urandlib_api_v1:
   0x7ffb734e39d0 <+0>: cc  int3
   0x7ffb734e39d1 <+1>: 0f 1e fanop%edx
   0x7ffb734e39d4 <+4>: 55  push   %rbp
   0x7ffb734e39d5 <+5>: 48 89 e5mov%rsp,%rbp
   0x7ffb734e39d8 <+8>: 48 83 ec 10 sub$0x10,%rsp
   0x7ffb734e39dc <+12>:48 8d 3d fe ed ff fflea
-0x1202(%rip),%rdi# 0x7ffb734e27e1
   0x7ffb734e39e3 <+19>:48 8d 75 fc lea-0x4(%rbp),%rsi
=> 0x7ffb734e39e7 <+23>:b0 00   mov$0x0,%al
   0x7ffb734e39e9 <+25>:e8 f2 00 00 00  call
0x7ffb734e3ae0 <__isoc99_scanf@plt>
   0x7ffb734e39ee <+30>:b8 01 00 00 00  mov$0x1,%eax
   0x7ffb734e39f3 <+35>:48 83 c4 10 add$0x10,%rsp
   0x7ffb734e39f7 <+39>:5d  pop%rbp
   0x7ffb734e39f8 <+40>:c3  ret


You can see it replaced the first byte, the following 3 bytes are
remnants of endbr64 (gdb says it's a nop? :)), and then we proceeded;
you can see I stepped through a few more instructions.

Works by accident?

But either way, if we prevent uprobes from being placed on endbr64, that
will essentially break any code that is compiled with endbr64
(-fcf-protection=branch), which is not great at all (I suspect most
people that care would just disable that option in such a case).

>
> Also, we tried very hard to not have a literal encode ENDBR (I really
> should teach objtool about this one :/). If it somehow makes sense to
> keep this clause, please use: gen_endbr()

I'll just use is_endbr(), no problem.



Re: [PATCH v4] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-10 Thread Andrii Nakryiko
On Wed, Jul 10, 2024 at 4:39 AM Peter Zijlstra  wrote:
>
> On Tue, Jul 09, 2024 at 10:50:00AM -0700, Andrii Nakryiko wrote:
> > On Tue, Jul 9, 2024 at 3:11 AM Peter Zijlstra  wrote:
> > >
> > > On Mon, Jul 08, 2024 at 04:11:27PM -0700, Andrii Nakryiko wrote:
> > > > +#ifdef CONFIG_UPROBES
> > > > +/*
> > > > + * Heuristic-based check if uprobe is installed at the function entry.
> > > > + *
> > > > + * Under assumption of user code being compiled with frame pointers,
> > > > + * `push %rbp/%ebp` is a good indicator that we indeed are.
> > > > + *
> > > > + * Similarly, `endbr64` (assuming 64-bit mode) is also a common 
> > > > pattern.
> > > > + * If we get this wrong, captured stack trace might have one extra 
> > > > bogus
> > > > + * entry, but the rest of stack trace will still be meaningful.
> > > > + */
> > > > +static bool is_uprobe_at_func_entry(struct pt_regs *regs)
> > > > +{
> > > > + struct arch_uprobe *auprobe;
> > > > +
> > > > + if (!current->utask)
> > > > + return false;
> > > > +
> > > > + auprobe = current->utask->auprobe;
> > > > + if (!auprobe)
> > > > + return false;
> > > > +
> > > > + /* push %rbp/%ebp */
> > > > + if (auprobe->insn[0] == 0x55)
> > > > + return true;
> > > > +
> > > > + /* endbr64 (64-bit only) */
> > > > + if (user_64bit_mode(regs) && *(u32 *)auprobe->insn == 0xfa1e0ff3)
> > > > + return true;
> > >
> > > I meant to reply to Josh suggesting this, but... how can this be? If you
> > > scribble the ENDBR with an INT3 things will #CP and we'll never get to
> > > the #BP.
> >
> > Well, it seems like it works in practice, I just tried. Here's the
> > disassembly of the function:
> >
> > 19d0 :
> > 19d0: f3 0f 1e fa   endbr64
> > 19d4: 55pushq   %rbp
> > 19d5: 48 89 e5  movq%rsp, %rbp
> > 19d8: 48 83 ec 10   subq$0x10, %rsp
> > 19dc: 48 8d 3d fe ed ff ff  leaq-0x1202(%rip), %rdi
> >  # 0x7e1 <__isoc99_scanf+0x7e1>
> > 19e3: 48 8d 75 fc   leaq-0x4(%rbp), %rsi
> > 19e7: b0 00 movb$0x0, %al
> > 19e9: e8 f2 00 00 00callq   0x1ae0 
> > <__isoc99_scanf+0x1ae0>
> > 19ee: b8 01 00 00 00movl$0x1, %eax
> > 19f3: 48 83 c4 10   addq$0x10, %rsp
> > 19f7: 5dpopq%rbp
> > 19f8: c3retq
> > 19f9: 0f 1f 80 00 00 00 00  nopl(%rax)
> >
> > And here's the state when uprobe is attached:
> >
> > (gdb) disass/r urandlib_api_v1
> > Dump of assembler code for function urandlib_api_v1:
> >0x7ffb734e39d0 <+0>: cc  int3
> >0x7ffb734e39d1 <+1>: 0f 1e fanop%edx
> >0x7ffb734e39d4 <+4>: 55  push   %rbp
> >0x7ffb734e39d5 <+5>: 48 89 e5mov%rsp,%rbp
> >0x7ffb734e39d8 <+8>: 48 83 ec 10 sub$0x10,%rsp
> >0x7ffb734e39dc <+12>:48 8d 3d fe ed ff fflea
> > -0x1202(%rip),%rdi# 0x7ffb734e27e1
> >0x7ffb734e39e3 <+19>:48 8d 75 fc lea
> > -0x4(%rbp),%rsi
> > => 0x7ffb734e39e7 <+23>:b0 00   mov$0x0,%al
> >0x7ffb734e39e9 <+25>:e8 f2 00 00 00  call
> > 0x7ffb734e3ae0 <__isoc99_scanf@plt>
> >0x7ffb734e39ee <+30>:b8 01 00 00 00  mov$0x1,%eax
> >0x7ffb734e39f3 <+35>:48 83 c4 10 add$0x10,%rsp
> >0x7ffb734e39f7 <+39>:5d  pop%rbp
> >0x7ffb734e39f8 <+40>:c3  ret
> >
> >
> > You can see it replaced the first byte, the following 3 bytes are
> > remnants of endb64 (gdb says it's a nop? :)), and then we proceeded,
> > you can see I stepped through a few more instructions.
> >
> > Works by accident?
>
> Yeah, we don't actually have Userspace IBT enabled yet, even on hardware
> that supports it.

OK, I don't know what the implications are, but it's a good accident :)

Anyways, what should I do for v4? Drop is_endbr6() check or keep it?



Re: [PATCH 1/3] uprobes: kill uprobe_register_refctr()

2024-07-10 Thread Andrii Nakryiko
On Wed, Jul 10, 2024 at 9:32 AM Oleg Nesterov  wrote:
>
> It doesn't make any sense to have 2 versions of _register(). Note that
> trace_uprobe_enable(), the only user of uprobe_register(), doesn't need
> to check tu->ref_ctr_offset to decide which one should be used, it could
> safely pass ref_ctr_offset == 0 to uprobe_register_refctr().
>
> Add this argument to uprobe_register(), update the callers, and kill
> uprobe_register_refctr().
>
> Signed-off-by: Oleg Nesterov 
> ---
>  include/linux/uprobes.h |  9 ++---
>  kernel/events/uprobes.c | 23 +--
>  kernel/trace/bpf_trace.c|  2 +-
>  kernel/trace/trace_uprobe.c |  8 ++--
>  4 files changed, 10 insertions(+), 32 deletions(-)
>

LGTM with few nits below.

Acked-by: Andrii Nakryiko 

>  /*
>   * uprobe_apply - unregister an already registered probe.
>   * @inode: the file in which the probe has to be removed.
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index d1daeab1bbc1..467f358c8ce7 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -3477,7 +3477,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr 
> *attr, struct bpf_prog *pr
>   &bpf_uprobe_multi_link_lops, prog);
>
> for (i = 0; i < cnt; i++) {
> -   err = uprobe_register_refctr(d_real_inode(link->path.dentry),
> +   err = uprobe_register(d_real_inode(link->path.dentry),
>  uprobes[i].offset,
>  uprobes[i].ref_ctr_offset,
>  &uprobes[i].consumer);

please adjust indentation here

> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index c98e3b3386ba..78a5c40e885a 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1089,12 +1089,8 @@ static int trace_uprobe_enable(struct trace_uprobe 
> *tu, filter_func_t filter)
> tu->consumer.filter = filter;
> tu->inode = d_real_inode(tu->path.dentry);
>
> -   if (tu->ref_ctr_offset)
> -   ret = uprobe_register_refctr(tu->inode, tu->offset,
> -   tu->ref_ctr_offset, &tu->consumer);
> -   else
> -   ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
> -
> +   ret = uprobe_register(tu->inode, tu->offset, tu->ref_ctr_offset,
> + &tu->consumer);

doesn't fit under 100 characters? If it does, please keep as a single line.

> if (ret)
> tu->inode = NULL;
>
> --
> 2.25.1.362.g51ebf55
>
>



Re: [PATCH 3/3] uprobes: make uprobe_register() return struct uprobe *

2024-07-10 Thread Andrii Nakryiko
On Wed, Jul 10, 2024 at 9:33 AM Oleg Nesterov  wrote:
>
> This way uprobe_unregister() and uprobe_apply() do not need find_uprobe() +
> put_uprobe(). And to me this change simplifies the code a bit.
>
> Signed-off-by: Oleg Nesterov 
> ---
>  include/linux/uprobes.h | 14 ++--
>  kernel/events/uprobes.c | 45 -
>  kernel/trace/bpf_trace.c| 12 +-
>  kernel/trace/trace_uprobe.c | 28 +++
>  4 files changed, 41 insertions(+), 58 deletions(-)
>
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index aa89a8b67039..399509befcf4 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h

I don't see struct uprobe forward-declared in this header, maybe we
should add it?
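I.e., just a plain forward declaration near the top of the header, e.g.:

    /* opaque to consumers; defined in kernel/events/uprobes.c */
    struct uprobe;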

> @@ -110,9 +110,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
>  extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
>  extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
>  extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct 
> *mm, unsigned long vaddr, uprobe_opcode_t);
> -extern int uprobe_register(struct inode *inode, loff_t offset, loff_t 
> ref_ctr_offset, struct uprobe_consumer *uc);
> -extern int uprobe_apply(struct inode *inode, loff_t offset, struct 
> uprobe_consumer *uc, bool);
> -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct 
> uprobe_consumer *uc);
> +extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, 
> loff_t ref_ctr_offset, struct uprobe_consumer *uc);
> +extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, 
> bool);
> +extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer 
> *uc);
>  extern int uprobe_mmap(struct vm_area_struct *vma);
>  extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, 
> unsigned long end);
>  extern void uprobe_start_dup_mmap(void);
> @@ -147,18 +147,18 @@ static inline void uprobes_init(void)
>
>  #define uprobe_get_trap_addr(regs) instruction_pointer(regs)
>
> -static inline int
> +static inline struct uprobe *
>  uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, 
> struct uprobe_consumer *uc)
>  {
> -   return -ENOSYS;
> +   return ERR_PTR(-ENOSYS);
>  }
>  static inline int
> -uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, 
> bool add)
> +uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add)
>  {
> return -ENOSYS;
>  }

Complete aside: when I was looking at this code I was wondering why we
even need uprobe_apply; it looks like some hacky variant of
uprobe_register and uprobe_unregister. I didn't dig deeper, but please
consider whether we even need this. If it's just there to avoid (for
some period) calling some consumer callback, then that could be handled
on the consumer side by ignoring such calls.

The callback call is cheap; it's the int3 handling that's expensive, and
with uprobe_apply we are already paying that anyway, so what is this
for?
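In principle a consumer could get the same effect with something along
these lines (a sketch; struct my_consumer and the 'enabled' flag are
made up, not anything existing):

    struct my_consumer {
            struct uprobe_consumer  consumer;
            bool                    enabled; /* toggled instead of calling uprobe_apply() */
    };

    static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
    {
            struct my_consumer *c = container_of(self, struct my_consumer, consumer);

            if (!READ_ONCE(c->enabled))
                    return 0;       /* pretend we are not attached */

            /* ... do the actual work ... */
            return 0;
    }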

>  static inline void
> -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer 
> *uc)
> +uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
>  {
>  }
>  static inline int uprobe_mmap(struct vm_area_struct *vma)

[...]

>
> @@ -1133,41 +1126,39 @@ EXPORT_SYMBOL_GPL(uprobe_unregister);
>   * refcount is released when the last @uc for the @uprobe
>   * unregisters. Caller of uprobe_register() is required to keep @inode
>   * (and the containing mount) referenced.
> - *
> - * Return errno if it cannot successully install probes
> - * else return 0 (success)

mention that it never returns NULL, but rather encodes error code
inside the pointer on error? It's an important part of the contract.
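I.e., callers are expected to do what the bpf_trace.c part of this patch
does (sketch):

    uprobe = uprobe_register(inode, offset, ref_ctr_offset, &uc);
    if (IS_ERR(uprobe))
            return PTR_ERR(uprobe); /* never NULL, the error is encoded in the pointer */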

>   */
> -int uprobe_register(struct inode *inode, loff_t offset, loff_t 
> ref_ctr_offset,
> -   struct uprobe_consumer *uc)
> +struct uprobe *uprobe_register(struct inode *inode,
> +   loff_t offset, loff_t ref_ctr_offset,
> +   struct uprobe_consumer *uc)
>  {

[...]

> @@ -1186,35 +1177,27 @@ int uprobe_register(struct inode *inode, loff_t 
> offset, loff_t ref_ctr_offset,
>
> if (unlikely(ret == -EAGAIN))
> goto retry;
> -   return ret;
> +
> +   return ret ? ERR_PTR(ret) : uprobe;
>  }
>  EXPORT_SYMBOL_GPL(uprobe_register);
>
>  /*

this should be /** for doccomment checking (you'd get a warning for
missing @uprobe if there was this extra star)
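E.g., something like this shape (a sketch, with the @uprobe line added):

    /**
     * uprobe_apply - enable or disable breakpoints for an already registered probe.
     * @uprobe: uprobe as returned by uprobe_register()
     * @uc: consumer which wants to add more or remove some breakpoints
     * @add: add or remove the breakpoints
     */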

>   * uprobe_apply - unregister an already registered probe.
> - * @inode: the file in which the probe has to be removed.
> - * @offset: offset from the start of the file.

add @uprobe description now?

>   * @uc: consumer which wants to add more or remove some breakpoints
>   * @add: add or remove the breakpoints
>   */
> -int uprobe_apply(struct inode *inode, loff_t offset,
> -   struct uprobe_consumer *uc, boo

Re: [PATCH 3/3] uprobes: make uprobe_register() return struct uprobe *

2024-07-10 Thread Andrii Nakryiko
On Wed, Jul 10, 2024 at 9:49 AM Jiri Olsa  wrote:
>
> On Wed, Jul 10, 2024 at 06:31:33PM +0200, Oleg Nesterov wrote:
>
> SNIP
>
> > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > index 467f358c8ce7..7571811127a2 100644
> > --- a/kernel/trace/bpf_trace.c
> > +++ b/kernel/trace/bpf_trace.c
> > @@ -3157,6 +3157,7 @@ struct bpf_uprobe {
> >   loff_t offset;
> >   unsigned long ref_ctr_offset;
> >   u64 cookie;
> > + struct uprobe *uprobe;
> >   struct uprobe_consumer consumer;
> >  };
> >
> > @@ -3180,10 +3181,8 @@ static void bpf_uprobe_unregister(struct path *path, 
> > struct bpf_uprobe *uprobes,
> >  {
> >   u32 i;
> >
> > - for (i = 0; i < cnt; i++) {
> > - uprobe_unregister(d_real_inode(path->dentry), 
> > uprobes[i].offset,
> > -   &uprobes[i].consumer);
> > - }
>
> nice, we could also drop path argument now

see my comments to Oleg; I think we can/should get rid of link->path
altogether if the uprobe itself keeps the inode alive.

BTW, Jiri, do we have any test for multi-uprobe that simulates partial
attachment success/failure (whichever way you want to look at it)? It
would be super useful to have, to exercise at least some of the error
handling code in the uprobe code base. If we don't, do you mind adding
something simple to BPF selftests?

>
> jirka
>
> > + for (i = 0; i < cnt; i++)
> > + uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer);
> >  }
> >
> >  static void bpf_uprobe_multi_link_release(struct bpf_link *link)
> > @@ -3477,11 +3476,12 @@ int bpf_uprobe_multi_link_attach(const union 
> > bpf_attr *attr, struct bpf_prog *pr
> > &bpf_uprobe_multi_link_lops, prog);
> >
> >   for (i = 0; i < cnt; i++) {
> > - err = uprobe_register(d_real_inode(link->path.dentry),
> > + uprobes[i].uprobe = 
> > uprobe_register(d_real_inode(link->path.dentry),
> >uprobes[i].offset,
> >uprobes[i].ref_ctr_offset,
> >&uprobes[i].consumer);
> > - if (err) {
> > + if (IS_ERR(uprobes[i].uprobe)) {
> > + err = PTR_ERR(uprobes[i].uprobe);
> >   bpf_uprobe_unregister(&path, uprobes, i);
> >   goto error_free;
> >   }



Re: [PATCH v4] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-10 Thread Andrii Nakryiko
On Wed, Jul 10, 2024 at 9:24 AM Josh Poimboeuf  wrote:
>
> On Wed, Jul 10, 2024 at 08:11:57AM -0700, Andrii Nakryiko wrote:
> > On Wed, Jul 10, 2024 at 4:39 AM Peter Zijlstra  wrote:
> > > On Tue, Jul 09, 2024 at 10:50:00AM -0700, Andrii Nakryiko wrote:
> > > > You can see it replaced the first byte, the following 3 bytes are
> > > > remnants of endb64 (gdb says it's a nop? :)), and then we proceeded,
> > > > you can see I stepped through a few more instructions.
> > > >
> > > > Works by accident?
> > >
> > > Yeah, we don't actually have Userspace IBT enabled yet, even on hardware
> > > that supports it.
> >
> > OK, I don't know what the implications are, but it's a good accident :)
> >
> > Anyways, what should I do for v4? Drop is_endbr6() check or keep it?
>
> Given the current behavior of uprobe overwriting ENDBR64 with INT3, the
> is_endbr6() check still makes sense, otherwise is_uprobe_at_func_entry()
> would never return true on OSes which have the ENDBR64 compiled in.
>
> However, once userspace IBT actually gets enabled, uprobe should skip
> the ENDBR64 and patch the subsequent instruction.  Then the is_endbr6()
> check would no longer be needed.
>

Ok, I'll keep it then, thanks.

> --
> Josh



[PATCH v5] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-07-10 Thread Andrii Nakryiko
When tracing user functions with uprobe functionality, it's common to
install the probe (e.g., a BPF program) at the first instruction of the
function. This is often going to be the `push %rbp` instruction in the
function preamble, which means that within that function the frame
pointer hasn't been established yet. This leads to consistently missing
the actual caller of the traced function, because perf_callchain_user()
only records the current IP (capturing the traced function) and then
follows the frame pointer chain (which would be the caller's frame,
containing the address of the caller's caller).

So when we have a target_1 -> target_2 -> target_3 call chain and we are
tracing an entry to target_3, the captured stack trace will report a
target_1 -> target_3 call chain, which is wrong and confusing.

This patch proposes an x86-64-specific heuristic to detect that the
traced instruction is `push %rbp` (`push %ebp` on 32-bit). Given that
the entire kernel implementation of user-space stack trace capturing
works under the assumption that user-space code was compiled with frame
pointer register (%rbp/%ebp) preservation, it seems pretty reasonable to
use this instruction as a strong indicator that this is the entry to the
function. In that case, the return address is still pointed to by
%rsp/%esp, so we fetch it and add it to the stack trace before
proceeding to unwind the rest using the frame pointer-based logic.

We also check for `endbr64` (for 64-bit mode) as another common pattern
at function entry, as suggested by Josh Poimboeuf. Even if we get this
wrong sometimes for uprobes attached not at the function entry, it's OK
because the stack trace will still be overall meaningful, just with one
extra bogus entry. If we don't detect this, we are guaranteed to miss
the caller's entry in the stack trace, which is worse overall.

Signed-off-by: Andrii Nakryiko 
---
 arch/x86/events/core.c  | 63 +
 include/linux/uprobes.h |  2 ++
 kernel/events/uprobes.c |  2 ++
 3 files changed, 67 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5b0dd07b1ef1..780b8dc36f05 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -41,6 +41,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "perf_event.h"
 
@@ -2813,6 +2815,46 @@ static unsigned long get_segment_base(unsigned int 
segment)
return get_desc_base(desc);
 }
 
+#ifdef CONFIG_UPROBES
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+   struct arch_uprobe *auprobe;
+
+   if (!current->utask)
+   return false;
+
+   auprobe = current->utask->auprobe;
+   if (!auprobe)
+   return false;
+
+   /* push %rbp/%ebp */
+   if (auprobe->insn[0] == 0x55)
+   return true;
+
+   /* endbr64 (64-bit only) */
+   if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn))
+   return true;
+
+   return false;
+}
+
+#else
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+   return false;
+}
+#endif /* CONFIG_UPROBES */
+
 #ifdef CONFIG_IA32_EMULATION
 
 #include 
@@ -2824,6 +2866,7 @@ perf_callchain_user32(struct pt_regs *regs, struct 
perf_callchain_entry_ctx *ent
unsigned long ss_base, cs_base;
struct stack_frame_ia32 frame;
const struct stack_frame_ia32 __user *fp;
+   u32 ret_addr;
 
if (user_64bit_mode(regs))
return 0;
@@ -2833,6 +2876,12 @@ perf_callchain_user32(struct pt_regs *regs, struct 
perf_callchain_entry_ctx *ent
 
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
+
+   /* see perf_callchain_user() below for why we do this */
+   if (is_uprobe_at_func_entry(regs) &&
+   !get_user(ret_addr, (const u32 __user *)regs->sp))
+   perf_callchain_store(entry, ret_addr);
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
@@ -2861,6 +2910,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
*entry, struct pt_regs *regs
 {
struct stack_frame frame;
const struct stack_frame __user *fp;
+   unsigned long ret_addr;
 
if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
@@ -2884,6 +2934,19 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
*entry, struct pt_regs *regs
return;
 
pagefault_di

Re: [PATCH 3/3] uprobes: make uprobe_register() return struct uprobe *

2024-07-10 Thread Andrii Nakryiko
On Wed, Jul 10, 2024 at 12:38 PM Jiri Olsa  wrote:
>
> On Wed, Jul 10, 2024 at 11:23:10AM -0700, Andrii Nakryiko wrote:
> > On Wed, Jul 10, 2024 at 9:49 AM Jiri Olsa  wrote:
> > >
> > > On Wed, Jul 10, 2024 at 06:31:33PM +0200, Oleg Nesterov wrote:
> > >
> > > SNIP
> > >
> > > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > > > index 467f358c8ce7..7571811127a2 100644
> > > > --- a/kernel/trace/bpf_trace.c
> > > > +++ b/kernel/trace/bpf_trace.c
> > > > @@ -3157,6 +3157,7 @@ struct bpf_uprobe {
> > > >   loff_t offset;
> > > >   unsigned long ref_ctr_offset;
> > > >   u64 cookie;
> > > > + struct uprobe *uprobe;
> > > >   struct uprobe_consumer consumer;
> > > >  };
> > > >
> > > > @@ -3180,10 +3181,8 @@ static void bpf_uprobe_unregister(struct path 
> > > > *path, struct bpf_uprobe *uprobes,
> > > >  {
> > > >   u32 i;
> > > >
> > > > - for (i = 0; i < cnt; i++) {
> > > > - uprobe_unregister(d_real_inode(path->dentry), 
> > > > uprobes[i].offset,
> > > > -   &uprobes[i].consumer);
> > > > - }
> > >
> > > nice, we could also drop path argument now
> >
> > see my comments to Oleg, I think we can/should get rid of link->path
> > altogether if uprobe itself keeps inode alive.
>
> yea, I was thinking of that, but then it's kind of useful to have it in
> bpf_uprobe_multi_link_fill_link_info, otherwise we have to take it from
> first uprobe in the link, but ok, still probably worth to remove it ;-)

if we need it for link_info, probably cleaner to just keep it, no big deal then

>
> anyway as you wrote it's ok for follow up cleanup, I'll check on that
>
> >
> > BTW, Jiri, do we have any test for multi-uprobe that simulates partial
> > attachment success/failure (whichever way you want to look at it). It
> > would be super useful to have to check at least some error handling
> > code in the uprobe code base. If we don't, do you mind adding
> > something simple to BPF selftests?
>
> there's test_attach_api_fails, but I think all checked fails are before
> actually calling uprobe_register function
>
> I think there are few ways to fail the uprobe_register, like install it
> on top of int3.. will check add some test for that
>

great, thank you!

> jirka
>
> >
> > >
> > > jirka
> > >
> > > > + for (i = 0; i < cnt; i++)
> > > > + uprobe_unregister(uprobes[i].uprobe, 
> > > > &uprobes[i].consumer);
> > > >  }
> > > >
> > > >  static void bpf_uprobe_multi_link_release(struct bpf_link *link)
> > > > @@ -3477,11 +3476,12 @@ int bpf_uprobe_multi_link_attach(const union 
> > > > bpf_attr *attr, struct bpf_prog *pr
> > > > &bpf_uprobe_multi_link_lops, prog);
> > > >
> > > >   for (i = 0; i < cnt; i++) {
> > > > - err = uprobe_register(d_real_inode(link->path.dentry),
> > > > + uprobes[i].uprobe = 
> > > > uprobe_register(d_real_inode(link->path.dentry),
> > > >uprobes[i].offset,
> > > >uprobes[i].ref_ctr_offset,
> > > >&uprobes[i].consumer);
> > > > - if (err) {
> > > > + if (IS_ERR(uprobes[i].uprobe)) {
> > > > + err = PTR_ERR(uprobes[i].uprobe);
> > > >   bpf_uprobe_unregister(&path, uprobes, i);
> > > >   goto error_free;
> > > >   }



Re: [PATCH 3/3] uprobes: make uprobe_register() return struct uprobe *

2024-07-10 Thread Andrii Nakryiko
On Wed, Jul 10, 2024 at 1:18 PM Oleg Nesterov  wrote:
>
> On 07/10, Andrii Nakryiko wrote:
> >
> > On Wed, Jul 10, 2024 at 9:33 AM Oleg Nesterov  wrote:
> > >
> > > This way uprobe_unregister() and uprobe_apply() do not need find_uprobe() 
> > > +
> > > put_uprobe(). And to me this change simplifies the code a bit.
> > >
> > > Signed-off-by: Oleg Nesterov 
> > > ---
> > >  include/linux/uprobes.h | 14 ++--
> > >  kernel/events/uprobes.c | 45 -
> > >  kernel/trace/bpf_trace.c| 12 +-
> > >  kernel/trace/trace_uprobe.c | 28 +++
> > >  4 files changed, 41 insertions(+), 58 deletions(-)
> > >
> > > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > > index aa89a8b67039..399509befcf4 100644
> > > --- a/include/linux/uprobes.h
> > > +++ b/include/linux/uprobes.h
> >
> > I don't see struct uprobe forward-declared in this header, maybe we
> > should add it?
>
> Probably yes, thanks... Although the current code already uses
> struct uprobes * without forward-declaration at least if CONFIG_UPROBES=y.
> Thanks, will add.
>

Yep, I saw that and was wondering as well.

> > >  static inline int
> > > -uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer 
> > > *uc, bool add)
> > > +uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add)
> > >  {
> > > return -ENOSYS;
> > >  }
> >
> > complete aside, when I was looking at this code I was wondering why we
> > even need uprobe_apply, it looks like some hacky variant of
> > uprobe_register and uprobe_unregister.
>
> All I can say is that
>
> - I can hardly recall this logic, I'll try to do this tomorrow
>   and write another email
>
> - in any case this logic is ugly and needs more cleanups
>
> - but this patch only tries to simplify this code without any
>   visible changes.

yep, that's why it's an aside, up to you

>
> > > @@ -1133,41 +1126,39 @@ EXPORT_SYMBOL_GPL(uprobe_unregister);
> > >   * refcount is released when the last @uc for the @uprobe
> > >   * unregisters. Caller of uprobe_register() is required to keep @inode
> > >   * (and the containing mount) referenced.
> > > - *
> > > - * Return errno if it cannot successully install probes
> > > - * else return 0 (success)
> >
> > mention that it never returns NULL, but rather encodes error code
> > inside the pointer on error? It's an important part of the contract.
>
> OK...
>
> > >  /*
> >
> > this should be /** for doccomment checking (you'd get a warning for
> > missing @uprobe if there was this extra star)
>
> Well, this is what we have before this patch, but OK
>
> > >   * uprobe_apply - unregister an already registered probe.
> > > - * @inode: the file in which the probe has to be removed.
> > > - * @offset: offset from the start of the file.
> >
> > add @uprobe description now?
>
> If only I knew what this @uprobe description can say ;)

I'm pointing this out because I accidentally used /** for a comment on
some function, and I got a bot report about a missing argument. I
think /** makes sense for documenting "public API" functions, which is
why all of the above.

>
> > > @@ -3180,10 +3181,8 @@ static void bpf_uprobe_unregister(struct path 
> > > *path, struct bpf_uprobe *uprobes,
> > >  {
> > > u32 i;
> > >
> > > -   for (i = 0; i < cnt; i++) {
> > > -   uprobe_unregister(d_real_inode(path->dentry), 
> > > uprobes[i].offset,
> > > - &uprobes[i].consumer);
> > > -   }
> > > +   for (i = 0; i < cnt; i++)
> >
> > you'll now need !IS_ERR_OR_NULL(uprobes[i].uprobe) check (or just NULL
> > check if you null-out it below)
>
> Hmm... are you sure? I'll re-check... See also the end of my email.

no, you are right, it should be fine

>
> > > @@ -3477,11 +3476,12 @@ int bpf_uprobe_multi_link_attach(const union 
> > > bpf_attr *attr, struct bpf_prog *pr
> > >   &bpf_uprobe_multi_link_lops, prog);
> > >
> > > for (i = 0; i < cnt; i++) {
> > > -   err = uprobe_register(d_real_inode(link->path.dentry),
> > > +   uprobes[i].uprobe = 
> > > uprobe_register(d_real_inode(

Re: [PATCH 3/3] uprobes: make uprobe_register() return struct uprobe *

2024-07-11 Thread Andrii Nakryiko
On Thu, Jul 11, 2024 at 2:28 AM Oleg Nesterov  wrote:
>
> On 07/10, Oleg Nesterov wrote:
> >
> > -void uprobe_unregister(struct inode *inode, loff_t offset, struct 
> > uprobe_consumer *uc)
> > +void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> >  {
> > - struct uprobe *uprobe;
> > -
> > - uprobe = find_uprobe(inode, offset);
> > - if (WARN_ON(!uprobe))
> > - return;
> > -
> >   down_write(&uprobe->register_rwsem);
> >   __uprobe_unregister(uprobe, uc);
> >   up_write(&uprobe->register_rwsem);
> > - put_uprobe(uprobe);
>
> OK, this is obviously wrong, needs get_uprobe/put_uprobe. 
> __uprobe_unregister()
> can free this uprobe, so up_write(&uprobe->register_rwsem) is not safe.

uprobe_register(), given that it returns an uprobe instance to the
caller, should keep a refcount on it (the reference belongs to the
uprobe_consumer). That's what I did in my patches; are you going to do
that as well?

We basically do the same thing, just the interfaces look a bit different.
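Roughly what I mean (a sketch based on the hunk above; this assumes the
reference taken in uprobe_register() is owned by the consumer):

    /* in uprobe_register(), before returning the uprobe to the caller: */
    get_uprobe(uprobe);     /* reference owned by the consumer */

    /* and then in uprobe_unregister(): */
    down_write(&uprobe->register_rwsem);
    __uprobe_unregister(uprobe, uc);
    up_write(&uprobe->register_rwsem);
    put_uprobe(uprobe);     /* drop the consumer's reference; may free the uprobe */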


>
> I'll send V2 on top of Peter's new version.
>
> Oleg.
>



  1   2   3   4   5   6   7   8   >