Re: [PATCH] tracing/probes: fix error check in parse_btf_field()

2024-05-26 Thread Carlos López



Hi,

On 26/5/24 12:17, Masami Hiramatsu (Google) wrote:

On Sat, 25 May 2024 20:21:32 +0200
Carlos López  wrote:


btf_find_struct_member() might return NULL or an error via the
ERR_PTR() macro. However, its caller in parse_btf_field() only checks
for the NULL condition. Fix this by using IS_ERR() and returning the
error up the stack.



Thanks for finding it!
I think this requires a new error message for the error_log file.
Can you add the log as

trace_probe_log_err(ctx->offset, BTF_ERROR);

And define BTF_ERROR in ERRORS@kernel/trace/trace_probe.h ?


Sounds good, but should we perhaps reuse BAD_BTF_TID?

```
C(BAD_BTF_TID,  "Failed to get BTF type info."),\
```

`btf_find_struct_member()` fails if `type` is not a struct or if it runs
OOM while allocating the anon stack, so it seems appropriate.
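
For illustration, the resulting error path in parse_btf_field() would look
roughly like this (a sketch only, assuming BAD_BTF_TID is reused; not the
final patch):

```c
	field = btf_find_struct_member(ctx->btf, type, fieldname, &anon_offs);
	if (IS_ERR(field)) {
		trace_probe_log_err(ctx->offset, BAD_BTF_TID);
		return PTR_ERR(field);
	}
	if (!field) {
		trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
		return -ENOENT;
	}
```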

Best,
Carlos


Thank you,


Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field access")
Signed-off-by: Carlos López 
---
  kernel/trace/trace_probe.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5e263c141574..5417e9712157 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -554,6 +554,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
anon_offs = 0;
field = btf_find_struct_member(ctx->btf, type, fieldname,
   &anon_offs);
+   if (IS_ERR(field))
+   return PTR_ERR(field);
if (!field) {
trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
return -ENOENT;
--
2.35.3






--
Carlos López
Security Engineer
SUSE Software Solutions



[PATCH] ftrace: Fix stack trace entry generated by ftrace_pid_func()

2024-05-26 Thread Tatsuya S
On setting set_ftrace_pid, an extra entry generated by ftrace_pid_func()
is shown in the stack trace (CONFIG_UNWINDER_FRAME_POINTER=y).

[004] .68.459382: 
 => 0xa00090af
 => ksys_read
 => __x64_sys_read
 => x64_sys_call
 => do_syscall_64
 => entry_SYSCALL_64_after_hwframe

To resolve this issue, increment the skip count
in function_stack_trace_call() if pids are set.

Signed-off-by: Tatsuya S 
---
 include/linux/ftrace.h | 1 +
 kernel/trace/ftrace.c  | 2 +-
 kernel/trace/trace_functions.c | 7 ++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 800995c425e0..d14447c0d0e9 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -666,6 +666,7 @@ unsigned long ftrace_location(unsigned long ip);
 unsigned long ftrace_location_range(unsigned long start, unsigned long end);
 unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec);
 unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec);
+bool ftrace_pids_enabled(struct ftrace_ops *ops);
 
 extern ftrace_func_t ftrace_trace_function;
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 65208d3b5ed9..e8ddd56d1e55 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -99,7 +99,7 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
 /* What to set function_trace_op to */
 static struct ftrace_ops *set_function_trace_op;
 
-static bool ftrace_pids_enabled(struct ftrace_ops *ops)
+bool ftrace_pids_enabled(struct ftrace_ops *ops)
 {
struct trace_array *tr;
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1bfbe105e8..455c9a880199 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -223,6 +223,7 @@ function_stack_trace_call(unsigned long ip, unsigned long 
parent_ip,
long disabled;
int cpu;
unsigned int trace_ctx;
+   int skip = STACK_SKIP;
 
if (unlikely(!tr->function_enabled))
return;
@@ -239,7 +240,11 @@ function_stack_trace_call(unsigned long ip, unsigned long 
parent_ip,
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
trace_function(tr, ip, parent_ip, trace_ctx);
-   __trace_stack(tr, trace_ctx, STACK_SKIP);
+#ifdef CONFIG_UNWINDER_FRAME_POINTER
+   if (ftrace_pids_enabled(op))
+   skip++;
+#endif
+   __trace_stack(tr, trace_ctx, skip);
}
 
	atomic_dec(&data->disabled);
-- 
2.45.1




Re: [PATCH] tracing/probes: fix error check in parse_btf_field()

2024-05-26 Thread Google
On Sat, 25 May 2024 20:21:32 +0200
Carlos López  wrote:

> btf_find_struct_member() might return NULL or an error via the
> ERR_PTR() macro. However, its caller in parse_btf_field() only checks
> for the NULL condition. Fix this by using IS_ERR() and returning the
> error up the stack.
> 

Thanks for finding it!
I think this requires a new error message for the error_log file.
Can you add the log as

trace_probe_log_err(ctx->offset, BTF_ERROR);

And define BTF_ERROR in ERRORS@kernel/trace/trace_probe.h ?

Thank you,

> Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field access")
> Signed-off-by: Carlos López 
> ---
>  kernel/trace/trace_probe.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> index 5e263c141574..5417e9712157 100644
> --- a/kernel/trace/trace_probe.c
> +++ b/kernel/trace/trace_probe.c
> @@ -554,6 +554,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
>   anon_offs = 0;
>   field = btf_find_struct_member(ctx->btf, type, fieldname,
>  &anon_offs);
> + if (IS_ERR(field))
> + return PTR_ERR(field);
>   if (!field) {
>   trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
>   return -ENOENT;
> -- 
> 2.35.3
> 


-- 
Masami Hiramatsu (Google) 



[PATCH 3/3] tracing/kprobe: Remove cleanup code unrelated to selftest

2024-05-26 Thread Masami Hiramatsu (Google)
From: Masami Hiramatsu (Google) 

This cleanup of all kprobe events is not related to the selftest
itself, and it can fail for reasons unrelated to this test.
If the test is successful, the generated events are cleaned up.
And if it is not, we cannot guarantee that the kprobe events will
work correctly. So, either way, there is no need to clean them up.

Signed-off-by: Masami Hiramatsu (Google) 
---
 kernel/trace/trace_kprobe.c |5 -
 1 file changed, 5 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 4abed36544d0..f94628c15c14 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -2129,11 +2129,6 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
 end:
-   ret = dyn_events_release_all(&trace_kprobe_ops);
-   if (ret) {
-   pr_warn("error on cleaning up probes.\n");
-   warn++;
-   }
/*
 * Wait for the optimizer work to finish. Otherwise it might fiddle
 * with probes in already freed __init text.




[PATCH 2/3] tracing/kprobe: Remove unneeded WARN_ON_ONCE() in selftests

2024-05-26 Thread Masami Hiramatsu (Google)
From: Masami Hiramatsu (Google) 

Since the kprobe-events selftest shows OK or NG along with the reason, the
WARN_ON_ONCE()s for each place are redundant. Let's remove them.

Signed-off-by: Masami Hiramatsu (Google) 
---
 kernel/trace/trace_kprobe.c |   26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 16383247bdbf..4abed36544d0 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -2023,18 +2023,18 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
 
ret = create_or_delete_trace_kprobe("p:testprobe 
kprobe_trace_selftest_target $stack $stack0 +0($stack)");
-   if (WARN_ON_ONCE(ret)) {
+   if (ret) {
pr_warn("error on probing function entry.\n");
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
+   if (tk == NULL) {
pr_warn("error on getting new probe.\n");
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
-   if (WARN_ON_ONCE(file == NULL)) {
+   if (file == NULL) {
pr_warn("error on getting probe file.\n");
warn++;
} else
@@ -2044,18 +2044,18 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
ret = create_or_delete_trace_kprobe("r:testprobe2 
kprobe_trace_selftest_target $retval");
-   if (WARN_ON_ONCE(ret)) {
+   if (ret) {
pr_warn("error on probing function return.\n");
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
+   if (tk == NULL) {
pr_warn("error on getting 2nd new probe.\n");
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
-   if (WARN_ON_ONCE(file == NULL)) {
+   if (file == NULL) {
pr_warn("error on getting probe file.\n");
warn++;
} else
@@ -2079,7 +2079,7 @@ static __init int kprobe_trace_self_tests_init(void)
 
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
+   if (tk == NULL) {
pr_warn("error on getting test probe.\n");
warn++;
} else {
@@ -2089,7 +2089,7 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
file = find_trace_probe_file(tk, top_trace_array());
-   if (WARN_ON_ONCE(file == NULL)) {
+   if (file == NULL) {
pr_warn("error on getting probe file.\n");
warn++;
} else
@@ -2098,7 +2098,7 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
+   if (tk == NULL) {
pr_warn("error on getting 2nd test probe.\n");
warn++;
} else {
@@ -2108,7 +2108,7 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
file = find_trace_probe_file(tk, top_trace_array());
-   if (WARN_ON_ONCE(file == NULL)) {
+   if (file == NULL) {
pr_warn("error on getting probe file.\n");
warn++;
} else
@@ -2117,20 +2117,20 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
ret = create_or_delete_trace_kprobe("-:testprobe");
-   if (WARN_ON_ONCE(ret)) {
+   if (ret) {
pr_warn("error on deleting a probe.\n");
warn++;
}
 
ret = create_or_delete_trace_kprobe("-:testprobe2");
-   if (WARN_ON_ONCE(ret)) {
+   if (ret) {
pr_warn("error on deleting a probe.\n");
warn++;
}
 
 end:
	ret = dyn_events_release_all(&trace_kprobe_ops);
-   if (WARN_ON_ONCE(ret)) {
+   if (ret) {
pr_warn("error on cleaning up probes.\n");
warn++;
}




[PATCH 1/3] tracing: Build event generation tests only as modules

2024-05-26 Thread Masami Hiramatsu (Google)
From: Masami Hiramatsu (Google) 

Since the kprobe and synth event generation tests add and enable the
generated events in init_module() and delete them in exit_module(),
if we build them in, those events are left in the kernel and cause the
kprobe event self-test to fail.

[   97.349708] [ cut here ]
[   97.353453] WARNING: CPU: 3 PID: 1 at kernel/trace/trace_kprobe.c:2133 
kprobe_trace_self_tests_init+0x3f1/0x480
[   97.357106] Modules linked in:
[   97.358488] CPU: 3 PID: 1 Comm: swapper/0 Not tainted 
6.9.0-g699646734ab5-dirty #14
[   97.361556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.15.0-1 04/01/2014
[   97.363880] RIP: 0010:kprobe_trace_self_tests_init+0x3f1/0x480
[   97.365538] Code: a8 24 08 82 e9 ae fd ff ff 90 0f 0b 90 48 c7 c7 e5 aa 0b 
82 e9 ee fc ff ff 90 0f 0b 90 48 c7 c7 2d 61 06 82 e9 8e fd ff ff 90 <0f> 0b 90 
48 c7 c7 33 0b 0c 82 89 c6 e8 6e 03 1f ff 41 ff c7 e9 90
[   97.370429] RSP: :c9013b50 EFLAGS: 00010286
[   97.371852] RAX: fff0 RBX: 888005919c00 RCX: 
[   97.373829] RDX: 888003f4 RSI: 8236a598 RDI: 888003f40a68
[   97.375715] RBP:  R08: 0001 R09: 
[   97.377675] R10: 811c9ae5 R11: 8120c4e0 R12: 
[   97.379591] R13: 0001 R14: 0015 R15: 
[   97.381536] FS:  () GS:88807dcc() 
knlGS:
[   97.383813] CS:  0010 DS:  ES:  CR0: 80050033
[   97.385449] CR2:  CR3: 02244000 CR4: 06b0
[   97.387347] DR0:  DR1:  DR2: 
[   97.389277] DR3:  DR6: fffe0ff0 DR7: 0400
[   97.391196] Call Trace:
[   97.391967]  
[   97.392647]  ? __warn+0xcc/0x180
[   97.393640]  ? kprobe_trace_self_tests_init+0x3f1/0x480
[   97.395181]  ? report_bug+0xbd/0x150
[   97.396234]  ? handle_bug+0x3e/0x60
[   97.397311]  ? exc_invalid_op+0x1a/0x50
[   97.398434]  ? asm_exc_invalid_op+0x1a/0x20
[   97.399652]  ? trace_kprobe_is_busy+0x20/0x20
[   97.400904]  ? tracing_reset_all_online_cpus+0x15/0x90
[   97.402304]  ? kprobe_trace_self_tests_init+0x3f1/0x480
[   97.403773]  ? init_kprobe_trace+0x50/0x50
[   97.404972]  do_one_initcall+0x112/0x240
[   97.406113]  do_initcall_level+0x95/0xb0
[   97.407286]  ? kernel_init+0x1a/0x1a0
[   97.408401]  do_initcalls+0x3f/0x70
[   97.409452]  kernel_init_freeable+0x16f/0x1e0
[   97.410662]  ? rest_init+0x1f0/0x1f0
[   97.411738]  kernel_init+0x1a/0x1a0
[   97.412788]  ret_from_fork+0x39/0x50
[   97.413817]  ? rest_init+0x1f0/0x1f0
[   97.414844]  ret_from_fork_asm+0x11/0x20
[   97.416285]  
[   97.417134] irq event stamp: 13437323
[   97.418376] hardirqs last  enabled at (13437337): [] 
console_unlock+0x11c/0x150
[   97.421285] hardirqs last disabled at (13437370): [] 
console_unlock+0x101/0x150
[   97.423838] softirqs last  enabled at (13437366): [] 
handle_softirqs+0x23f/0x2a0
[   97.426450] softirqs last disabled at (13437393): [] 
__irq_exit_rcu+0x66/0xd0
[   97.428850] ---[ end trace  ]---

To avoid this issue, build these tests only as modules.

Fixes: 9fe41efaca08 ("tracing: Add synth event generation test module")
Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module")
Signed-off-by: Masami Hiramatsu (Google) 
---
 kernel/trace/Kconfig |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 166ad5444eea..721c3b221048 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1136,7 +1136,7 @@ config PREEMPTIRQ_DELAY_TEST
 
 config SYNTH_EVENT_GEN_TEST
tristate "Test module for in-kernel synthetic event generation"
-   depends on SYNTH_EVENTS
+   depends on SYNTH_EVENTS && m
help
   This option creates a test module to check the base
   functionality of in-kernel synthetic event definition and
@@ -1149,7 +1149,7 @@ config SYNTH_EVENT_GEN_TEST
 
 config KPROBE_EVENT_GEN_TEST
tristate "Test module for in-kernel kprobe event generation"
-   depends on KPROBE_EVENTS
+   depends on KPROBE_EVENTS && m
help
   This option creates a test module to check the base
   functionality of in-kernel kprobe event definition.




[PATCH 0/3] tracing: Fix some selftest issues

2024-05-26 Thread Masami Hiramatsu (Google)
Hi,

Here is a series of some fixes/improvements for the test modules and boot
time selftest of kprobe events. I found a WARNING message with some boot 
time selftest configuration, which came from the combination of embedded
kprobe generate API tests module and ftrace boot-time selftest. So the main
problem is that the test module should not be built-in. But I also think
this WARNING message is useless (because there are warning messages already)
and the cleanup code is redundant. This series fixes those issues.

Thank you,

---

Masami Hiramatsu (Google) (3):
  tracing: Build event generation tests only as modules
  tracing/kprobe: Remove unneeded WARN_ON_ONCE() in selftests
  tracing/kprobe: Remove cleanup code unrelated to selftest


 kernel/trace/Kconfig|4 ++--
 kernel/trace/trace_kprobe.c |   29 -
 2 files changed, 14 insertions(+), 19 deletions(-)

--
Masami Hiramatsu (Google) 



[PATCH v2] x86/paravirt: Disable virt spinlock on bare metal

2024-05-25 Thread Chen Yu
The kernel can change spinlock behavior when running as a guest. But
this guest-friendly behavior causes performance problems on bare metal.
So there's a 'virt_spin_lock_key' static key to switch between the two
modes.

The static key is always enabled by default (run in guest mode) and
should be disabled for bare metal (and in some guests that want native
behavior).

A performance drop is reported when running an encode/decode workload and
the BenchSEE cache sub-workload.
Bisect points to commit ce0a1b608bfc ("x86/paravirt: Silence unused
native_pv_lock_init() function warning"). When CONFIG_PARAVIRT_SPINLOCKS
is disabled, the virt_spin_lock_key is incorrectly set to true on bare
metal. The qspinlock degenerates to a test-and-set spinlock, which
decreases performance on bare metal.

Fix this by disabling virt_spin_lock_key if it is on bare metal,
regardless of CONFIG_PARAVIRT_SPINLOCKS.

Fixes: ce0a1b608bfc ("x86/paravirt: Silence unused native_pv_lock_init() function warning")
Suggested-by: Dave Hansen 
Suggested-by: Qiuxu Zhuo 
Reported-by: Prem Nath Dey 
Reported-by: Xiaoping Zhou 
Reviewed-by: Juergen Gross 
Signed-off-by: Chen Yu 
---
v1->v2:
  Refine the commit log per Dave's suggestion.
  Simplify the fix by directly disabling the virt_spin_lock_key on bare metal.
  Collect Reviewed-by from Juergen.
---
 arch/x86/kernel/paravirt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..c193c9e60a1b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -55,8 +55,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
 
 void __init native_pv_lock_init(void)
 {
-   if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
-   !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+   if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
	static_branch_disable(&virt_spin_lock_key);
 }
 
-- 
2.25.1




Re: [RFC PATCH 00/20] Introduce the famfs shared-memory file system

2024-05-25 Thread Dave Chinner
On Fri, May 24, 2024 at 09:55:48AM +0200, Miklos Szeredi wrote:
> On Fri, 24 May 2024 at 02:47, John Groves  wrote:
> 
> > Apologies, but I'm short on time at the moment - going into a long holiday
> > weekend in the US with family plans. I should be focused again by middle of
> > next week.
> 
> NP.
> 
> Obviously I'll need to test it before anything is merged, other than
> that this is not urgent at all...
> 
> > But can you check /proc/cmdline to see of the memmap arg got through without
> > getting mangled? The '$' tends to get fubar'd. You might need \$, or I've 
> > seen
> > the need for \\\$. If it's un-mangled, there should be a dax device.
> 
> /proc/cmdline shows the option correctly:
> 
> root@kvm:~# cat /proc/cmdline
> root=/dev/vda console=hvc0 memmap=4G$4G
> 
> > If that doesn't work, it's worth trying '!' instead, which I think would 
> > give
> > you a pmem device - if the arg gets through (but ! is less likely to get
> > horked). That pmem device can be converted to devdax...
> 
> That doesn't work either.  No device created in /dev  (dax or pmem).

I think you need to do some ndctl magic to get the memory to be
namespaced correctly for the correct devices to appear.

https://docs.pmem.io/ndctl-user-guide/managing-namespaces

IIRC, need to set the type to pmem and the mode to fsdax, devdax or
raw to get the relevant device nodes to be created for the range..

Cheers,

Dave.

-- 
Dave Chinner
da...@fromorbit.com



[PATCH] tracing/probes: fix error check in parse_btf_field()

2024-05-25 Thread Carlos López
btf_find_struct_member() might return NULL or an error via the
ERR_PTR() macro. However, its caller in parse_btf_field() only checks
for the NULL condition. Fix this by using IS_ERR() and returning the
error up the stack.

Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field access")
Signed-off-by: Carlos López 
---
 kernel/trace/trace_probe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5e263c141574..5417e9712157 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -554,6 +554,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
anon_offs = 0;
field = btf_find_struct_member(ctx->btf, type, fieldname,
   &anon_offs);
+   if (IS_ERR(field))
+   return PTR_ERR(field);
if (!field) {
trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
return -ENOENT;
-- 
2.35.3




Re: [PATCH RFC 1/2] dt-bindings: soc: qcom,smsm: Allow specifying mboxes instead of qcom,ipc

2024-05-25 Thread Krzysztof Kozlowski
On 24/05/2024 19:55, Luca Weiss wrote:
> On Donnerstag, 23. Mai 2024 08:19:11 MESZ Krzysztof Kozlowski wrote:
>> On 23/05/2024 08:16, Luca Weiss wrote:
>>> On Donnerstag, 23. Mai 2024 08:02:13 MESZ Krzysztof Kozlowski wrote:
 On 22/05/2024 19:34, Luca Weiss wrote:
> On Mittwoch, 22. Mai 2024 08:49:43 MESZ Krzysztof Kozlowski wrote:
>> On 21/05/2024 22:35, Luca Weiss wrote:
>>> On Dienstag, 21. Mai 2024 10:58:07 MESZ Krzysztof Kozlowski wrote:
 On 20/05/2024 17:11, Luca Weiss wrote:
> Hi Krzysztof
>
> Ack, sounds good.
>
> Maybe also from you, any opinion between these two binding styles?
>
> So first using index of mboxes for the numbering, where for the known
> usages the first element (and sometimes the 3rd - ipc-2) are empty <>.
>
> The second variant is using mbox-names to get the correct channel-mbox
> mapping.
>
> -   qcom,ipc-1 = < 8 13>;
> -   qcom,ipc-2 = < 8 9>;
> -   qcom,ipc-3 = < 8 19>;
> +   mboxes = <0>, < 13>, < 9>, < 19>;
>
> vs.
>
> -   qcom,ipc-1 = < 8 13>;
> -   qcom,ipc-2 = < 8 9>;
> -   qcom,ipc-3 = < 8 19>;
> +   mboxes = < 13>, < 9>, < 19>;
> +   mbox-names = "ipc-1", "ipc-2", "ipc-3";

 Sorry, don't get, ipc-1 is the first mailbox, so why would there be <0>
 in first case?
>>>
>>> Actually not, ipc-0 would be permissible by the driver, used for the 
>>> 0th host
>>>
>>> e.g. from:
>>>
>>> /* Iterate over all hosts to check whom wants a kick */
>>> for (host = 0; host < smsm->num_hosts; host++) {
>>> hostp = &smsm->hosts[host];
>>>
>>> Even though no mailbox is specified in any upstream dts for this 0th 
>>> host I
>>> didn't want the bindings to restrict that, that's why in the first 
>>> example
>>> there's an empty element (<0>) for the 0th smsm host
>>>
 Anyway, the question is if you need to know that some
 mailbox is missing. But then it is weird to name them "ipc-1" etc.
>>>
>>> In either case we'd just query the mbox (either by name or index) and 
>>> then
>>> see if it's there? Not quite sure I understand the sentence..
>>> Pretty sure either binding would work the same way.
>>
>> The question is: does the driver care only about having some mailboxes
>> or the driver cares about each specific mailbox? IOW, is skipping ipc-0
>> important for the driver?
>
> There's nothing special from driver side about any mailbox. Some SoCs have
> a mailbox for e.g. hosts 1&2&3, some have only 1&3, and apq8064 even has
> 1&2&3&4.
>
> And if the driver doesn't find a mailbox for a host, it just ignores it
> but then of course it can't 'ring' the mailbox for that host when 
> necessary.
>
> Not sure how much more I can add here, to be fair I barely understand what
> this driver is doing myself apart from the obvious.

 From what you said, it looks like it is enough to just list mailboxes,
 e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):
>>>
>>> No, for sure we need also the possibility to list ipc-3.
>>
>> ? You can list it, what's the problem?
> 
> Maybe we're talking past each other...
> 
> You asked why this wouldn't work:
> 
>   e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):
>   mboxes = < 13>, < 9>, < 19>;
> 
> How would we know that the 3rd mailbox ( 19) is for the 4th host
> (previous ipc-4)?
> 
> 1. If we use mboxes with indexes we'd need to have <0> values for
> "smsm hosts" where we don't have a mailbox for - this is at least
> for the 2nd smsm host (qcom,ipc-2) for a bunch of SoCs.
> 
> 2. If we use mboxes with mbox-names then we could skip that since we
> can directly specify which "smsm host" a given mailbox is for.
> 
> My only question really is whether 1. or 2. is a better idea.
> 
> Is this clearer now or still not?


So again, does the driver care about missing entry? If so, why?

Best regards,
Krzysztof
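
For illustration, a rough sketch of how a driver could look up the per-host
channel under variant 2 with the generic mailbox client API (hypothetical;
the smsm driver does not currently do this, and smsm_get_mbox() is an
invented helper name):

```c
#include <linux/err.h>
#include <linux/mailbox_client.h>

/* Hypothetical sketch: ask for the "ipc-<host>" channel; a missing entry
 * simply means this host has no mailbox to kick.
 */
static struct mbox_chan *smsm_get_mbox(struct mbox_client *cl, int host)
{
	char name[16];
	struct mbox_chan *chan;

	snprintf(name, sizeof(name), "ipc-%d", host);
	chan = mbox_request_channel_byname(cl, name);
	if (IS_ERR(chan))
		return NULL;	/* treat as "no mailbox for this host" */
	return chan;
}
```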




Re: [PATCH v10 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-05-25 Thread Google
On Fri, 24 May 2024 18:41:56 -0400
Steven Rostedt  wrote:

> On Tue,  7 May 2024 23:08:00 +0900
> "Masami Hiramatsu (Google)"  wrote:
> 
> > Steven Rostedt (VMware) (15):
> >   function_graph: Convert ret_stack to a series of longs
> >   fgraph: Use BUILD_BUG_ON() to make sure we have structures divisible 
> > by long
> >   function_graph: Add an array structure that will allow multiple 
> > callbacks
> >   function_graph: Allow multiple users to attach to function graph
> >   function_graph: Remove logic around ftrace_graph_entry and return
> >   ftrace/function_graph: Pass fgraph_ops to function graph callbacks
> >   ftrace: Allow function_graph tracer to be enabled in instances
> >   ftrace: Allow ftrace startup flags exist without dynamic ftrace
> >   function_graph: Have the instances use their own ftrace_ops for 
> > filtering
> >   function_graph: Add "task variables" per task for fgraph_ops
> >   function_graph: Move set_graph_function tests to shadow stack global 
> > var
> >   function_graph: Move graph depth stored data to shadow stack global 
> > var
> >   function_graph: Move graph notrace bit to shadow stack global var
> >   function_graph: Implement fgraph_reserve_data() and 
> > fgraph_retrieve_data()
> >   function_graph: Add selftest for passing local variables
> 
> Hi Masami,
> 
> While reviewing these patches, I realized there's several things I dislike
> about the patches I wrote. So I took these patches and started cleaning
> them up a little. Mostly renaming functions and adding comments.

Thanks for cleaning up the patches!!

> 
> As this is a major change to the function graph tracer, and I feel nervous
> about building something on top of this, how about I take over these
> patches and push them out for the next merge window. I'm hoping to get them
> into linux-next by v6.10-rc2 (I spent the day working on them, and it's
> mostly minor tweaks).

OK.

> Then I can push it out to 6.11 and get some good testing against it. Then
> we can add your stuff on top and get that merged in 6.12.

Yeah, that is a reasonable plan. I also have concerns about the stability.
In particular, this involves fprobe side changes too. If we introduce both
at once, it may mess up many things.

> 
> If all goes well, I'm hoping to get a series on just these patches (and
> your selftest addition) by tonight.
> 
> Thoughts?

I agree with you.

Thank you,

> 
> -- Steve


-- 
Masami Hiramatsu (Google) 



Re: [PATCH v10 07/36] function_graph: Allow multiple users to attach to function graph

2024-05-25 Thread Google
On Fri, 24 May 2024 21:32:08 -0400
Steven Rostedt  wrote:

> On Tue,  7 May 2024 23:09:22 +0900
> "Masami Hiramatsu (Google)"  wrote:
> 
> > @@ -109,6 +244,21 @@ ftrace_push_return_trace(unsigned long ret, unsigned 
> > long func,
> > if (!current->ret_stack)
> > return -EBUSY;
> >  
> > +   /*
> > +* At first, check whether the previous fgraph callback is pushed by
> > +* the fgraph on the same function entry.
> > +* But if @func is the self tail-call function, we also need to ensure
> > +* the ret_stack is not for the previous call by checking whether the
> > +* bit of @fgraph_idx is set or not.
> > +*/
> > +   ret_stack = get_ret_stack(current, current->curr_ret_stack, &offset);
> > +   if (ret_stack && ret_stack->func == func &&
> > +   get_fgraph_type(current, offset + FGRAPH_FRAME_OFFSET) == 
> > FGRAPH_TYPE_BITMAP &&
> > +   !is_fgraph_index_set(current, offset + FGRAPH_FRAME_OFFSET, 
> > fgraph_idx))
> > +   return offset + FGRAPH_FRAME_OFFSET;
> > +
> > +   val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
> > +
> > BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
> 
> I'm trying to figure out what the above is trying to do. This gets called
> once in function_graph_enter() (or function_graph_enter_ops()). What
> exactly are you trying to catch here?

Aah, good catch! This was originally for catching the self tail-call case with
multiple fgraph callbacks on the same function, but that was a misreading on
my part.
In a later patch ([12/36]), we introduced function_graph_enter_ops() so that
we can skip checking the hash table and directly pass the fgraph_ops to the
user callback. I thought this function_graph_enter_ops() was used even if
multiple fgraphs are set on the same function. In that case, we always need
to check whether the stack can be reused (pushed by another fgraph_ops on the
same function) or not.
But as we discussed, function_graph_enter_ops() is used only when a single
fgraph is set on the function (if multiple fgraphs are set on the same
function, function_graph_enter() is used instead), so we are sure that
ftrace_push_return_trace() is called only once when hooking the function
entry. Thus we don't need to reuse it.

> 
> Is it from this email:
> 
>   
> https://lore.kernel.org/all/20231110105154.df937bf9f200a0c16806c...@kernel.org/
> 
> As that's the last version before you added the above code.
> 
> But you also noticed it may not be needed, but triggered a crash without it
> in v3:
> 
>   
> https://lore.kernel.org/all/20231205234511.3839128259dfec153ea7d...@kernel.org/
> 
> I removed this code in my version and it runs just fine. Perhaps there was
> another bug that this was hiding that you fixed in later versions?

No problem. I think we can remove this block safely.

Thank you,

> 
> -- Steve
> 


-- 
Masami Hiramatsu (Google) 



Re: [PATCH] x86/paravirt: Disable virt spinlock when CONFIG_PARAVIRT_SPINLOCKS disabled

2024-05-25 Thread Chen Yu
On 2024-05-23 at 09:30:59 -0700, Dave Hansen wrote:
> On 5/16/24 06:02, Chen Yu wrote:
> > Performance drop is reported when running encode/decode workload and
> > BenchSEE cache sub-workload.
> > Bisect points to commit ce0a1b608bfc ("x86/paravirt: Silence unused
> > native_pv_lock_init() function warning"). When CONFIG_PARAVIRT_SPINLOCKS
> > is disabled the virt_spin_lock_key is set to true on bare-metal.
> > The qspinlock degenerates to test-and-set spinlock, which decrease the
> > performance on bare-metal.
> > 
> > Fix this by disabling virt_spin_lock_key if CONFIG_PARAVIRT_SPINLOCKS
> > is not set, or it is on bare-metal.
> 
> This is missing some background:
> 
> The kernel can change spinlock behavior when running as a guest.  But
> this guest-friendly behavior causes performance problems on bare metal.
> So there's a 'virt_spin_lock_key' static key to switch between the two
> modes.
> 
> The static key is always enabled by default (run in guest mode) and
> should be disabled for bare metal (and in some guests that want native
> behavior).
> 
> ... then describe the regression and the fix
>
Thanks Juergen for your review.

And thanks Dave for the write up, I'll refine the log according to your 
suggestion. 

> > diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> > index 5358d43886ad..ee51c0949ed8 100644
> > --- a/arch/x86/kernel/paravirt.c
> > +++ b/arch/x86/kernel/paravirt.c
> > @@ -55,7 +55,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
> >  
> >  void __init native_pv_lock_init(void)
> >  {
> > -   if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
> > +   if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) ||
> > !boot_cpu_has(X86_FEATURE_HYPERVISOR))
> > static_branch_disable(_spin_lock_key);
> >  }
> This gets used at a single site:
> 
> if (pv_enabled())
> goto pv_queue;
> 
> if (virt_spin_lock(lock))
> return;
> 
> which is logically:
> 
>   if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS))
>   goto ...; // don't look at virt_spin_lock_key
> 
>   if (virt_spin_lock_key)
>   return; // On virt, but non-paravirt.  Did Test-and-Set
>   // spinlock.
>

Thanks for the detailed description. My original change might break the
"X86_FEATURE_HYPERVISOR + NO_CONFIG_PARAVIRT_SPINLOCKS" case, in which the
guest can no longer fall back to test-and-set.
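
To make the difference concrete, here is a side-by-side sketch of the two
checks, reconstructed from the v1 and v2 patches in this thread:

```c
/* v1: also disables the key when CONFIG_PARAVIRT_SPINLOCKS is off while
 * running as a guest, so such a guest loses the test-and-set path.
 */
if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) ||
    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
	static_branch_disable(&virt_spin_lock_key);

/* v2: disable only on bare metal, regardless of the config option. */
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
	static_branch_disable(&virt_spin_lock_key);
```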
 
> So I _think_ Arnd was trying to optimize native_pv_lock_init() away when
> it's going to get skipped over anyway by the 'goto'.
> 
> But this took me at least 30 minutes of scratching my head and trying to
> untangle the whole thing.  It's all far too subtle for my taste, and all
> of that to save a few bytes of init text in a configuration that's
> probably not even used very often (PARAVIRT=y, but PARAVIRT_SPINLOCKS=n).
> 
> Let's just keep it simple.  How about the attached patch?

Yes, this one works, I'll refine it.

thanks,
Chenyu 



[PATCH 18/20] ftrace: Add multiple fgraph storage selftest

2024-05-24 Thread Steven Rostedt
From: "Masami Hiramatsu (Google)" 

Add a selftest for multiple function graph tracers with storage on the same
function. In this case, the shadow stack entry will be shared among those
fgraph instances with different data storage. So this will ensure the fgraph
does not mix up that storage data.

Link: 
https://lore.kernel.org/linux-trace-kernel/171509111465.162236.3795819216426570800.stgit@devnote2

Signed-off-by: Masami Hiramatsu (Google) 
Suggested-by: Steven Rostedt (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_selftest.c | 171 +-
 1 file changed, 126 insertions(+), 45 deletions(-)

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index fcdc744c245e..369efc569238 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -762,28 +762,32 @@ trace_selftest_startup_function(struct tracer *trace, 
struct trace_array *tr)
 #define SHORT_NUMBER 12345
 #define WORD_NUMBER 1234567890
 #define LONG_NUMBER 1234567890123456789LL
-
-static int fgraph_store_size __initdata;
-static const char *fgraph_store_type_name __initdata;
-static char *fgraph_error_str __initdata;
-static char fgraph_error_str_buf[128] __initdata;
+#define ERRSTR_BUFLEN 128
+
+struct fgraph_fixture {
+   struct fgraph_ops gops;
+   int store_size;
+   const char *store_type_name;
+   char error_str_buf[ERRSTR_BUFLEN];
+   char *error_str;
+};
 
 static __init int store_entry(struct ftrace_graph_ent *trace,
  struct fgraph_ops *gops)
 {
-   const char *type = fgraph_store_type_name;
-   int size = fgraph_store_size;
+   struct fgraph_fixture *fixture = container_of(gops, struct 
fgraph_fixture, gops);
+   const char *type = fixture->store_type_name;
+   int size = fixture->store_size;
void *p;
 
p = fgraph_reserve_data(gops->idx, size);
if (!p) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "Failed to reserve %s\n", type);
-   fgraph_error_str = fgraph_error_str_buf;
return 0;
}
 
-   switch (fgraph_store_size) {
+   switch (size) {
case 1:
*(char *)p = BYTE_NUMBER;
break;
@@ -804,7 +808,8 @@ static __init int store_entry(struct ftrace_graph_ent 
*trace,
 static __init void store_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops)
 {
-   const char *type = fgraph_store_type_name;
+   struct fgraph_fixture *fixture = container_of(gops, struct 
fgraph_fixture, gops);
+   const char *type = fixture->store_type_name;
long long expect = 0;
long long found = -1;
int size;
@@ -812,20 +817,18 @@ static __init void store_return(struct ftrace_graph_ret 
*trace,
 
	p = fgraph_retrieve_data(gops->idx, &size);
if (!p) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "Failed to retrieve %s\n", type);
-   fgraph_error_str = fgraph_error_str_buf;
return;
}
-   if (fgraph_store_size > size) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   if (fixture->store_size > size) {
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "Retrieved size %d is smaller than expected %d\n",
-size, (int)fgraph_store_size);
-   fgraph_error_str = fgraph_error_str_buf;
+size, (int)fixture->store_size);
return;
}
 
-   switch (fgraph_store_size) {
+   switch (fixture->store_size) {
case 1:
expect = BYTE_NUMBER;
found = *(char *)p;
@@ -845,45 +848,44 @@ static __init void store_return(struct ftrace_graph_ret 
*trace,
}
 
if (found != expect) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "%s returned not %lld but %lld\n", type, expect, 
found);
-   fgraph_error_str = fgraph_error_str_buf;
return;
}
-   fgraph_error_str = NULL;
+   fixture->error_str = NULL;
 }
 
-static struct fgraph_ops store_bytes __initdata = {
-   .entryfunc  = store_entry,
-   .retfunc= store_return,
-};
-
-static int __init test_graph_storage_type(const char *name, int size)
+static int __init init_fgraph_fixture(struct fgraph_fixture *fixture)
 {
char *func_name;
int len;
-   int ret;
 
-   fgraph_store_type_name = name;
-   fgraph_store_size = size;
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+"Failed to 

[PATCH 20/20] function_graph: Use bitmask to loop on fgraph entry

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Instead of looping through all the elements of fgraph_array[] to see if
there's a gops attached to one and then calling its gops->func(), create
a fgraph_array_bitmask that sets bits when an index in the array is
reserved (via the simple LRU algorithm). Then only the bits set in this
bitmask need to be looked at, since only elements in the array that have
ops registered need to be examined.

Note, we do not care about races. If a bit is set before the gops is
assigned, it only wastes time looking at the element and ignoring it (as
it did before this bitmask was added).

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 5e8e13ffcfb6..1aae521e5997 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -173,6 +173,7 @@ DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
 static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+static unsigned long fgraph_array_bitmask;
 
 /* LRU index table for fgraph_array */
 static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
@@ -197,6 +198,8 @@ static int fgraph_lru_release_index(int idx)
 
fgraph_lru_table[fgraph_lru_last] = idx;
fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+
+   clear_bit(idx, &fgraph_array_bitmask);
return 0;
 }
 
@@ -211,6 +214,8 @@ static int fgraph_lru_alloc_index(void)
 
fgraph_lru_table[fgraph_lru_next] = -1;
fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+
+   set_bit(idx, &fgraph_array_bitmask);
return idx;
 }
 
@@ -632,7 +637,8 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
if (offset < 0)
goto out;
 
-   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
+   for_each_set_bit(i, &fgraph_array_bitmask,
+sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
struct fgraph_ops *gops = fgraph_array[i];
int save_curr_ret_stack;
 
-- 
2.43.0





[PATCH 19/20] function_graph: Use for_each_set_bit() in __ftrace_return_to_handler()

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Instead of iterating through the entire fgraph_array[] and seeing if one
of the bitmap bits is set to know whether to call the array's retfunc()
function, use for_each_set_bit() on the bitmap itself. This will only
iterate over the set bits.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 4d503b3e45ad..5e8e13ffcfb6 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -827,11 +827,10 @@ static unsigned long __ftrace_return_to_handler(struct 
fgraph_ret_regs *ret_regs
 #endif
 
bitmap = get_bitmap_bits(current, offset);
-   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
+
+   for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
struct fgraph_ops *gops = fgraph_array[i];
 
-   if (!(bitmap & BIT(i)))
-   continue;
		if (gops == &fgraph_stub)
continue;
 
-- 
2.43.0





[PATCH 17/20] function_graph: Add selftest for passing local variables

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Add a boot-up selftest that passes variables from a function entry to a
function exit, and makes sure that they do get passed around.

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509110271.162236.11047551496319744627.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_selftest.c | 169 ++
 1 file changed, 169 insertions(+)

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index f8f55fd79e53..fcdc744c245e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -756,6 +756,173 @@ trace_selftest_startup_function(struct tracer *trace, 
struct trace_array *tr)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define BYTE_NUMBER 123
+#define SHORT_NUMBER 12345
+#define WORD_NUMBER 1234567890
+#define LONG_NUMBER 1234567890123456789LL
+
+static int fgraph_store_size __initdata;
+static const char *fgraph_store_type_name __initdata;
+static char *fgraph_error_str __initdata;
+static char fgraph_error_str_buf[128] __initdata;
+
+static __init int store_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+   const char *type = fgraph_store_type_name;
+   int size = fgraph_store_size;
+   void *p;
+
+   p = fgraph_reserve_data(gops->idx, size);
+   if (!p) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Failed to reserve %s\n", type);
+   fgraph_error_str = fgraph_error_str_buf;
+   return 0;
+   }
+
+   switch (fgraph_store_size) {
+   case 1:
+   *(char *)p = BYTE_NUMBER;
+   break;
+   case 2:
+   *(short *)p = SHORT_NUMBER;
+   break;
+   case 4:
+   *(int *)p = WORD_NUMBER;
+   break;
+   case 8:
+   *(long long *)p = LONG_NUMBER;
+   break;
+   }
+
+   return 1;
+}
+
+static __init void store_return(struct ftrace_graph_ret *trace,
+   struct fgraph_ops *gops)
+{
+   const char *type = fgraph_store_type_name;
+   long long expect = 0;
+   long long found = -1;
+   int size;
+   char *p;
+
+   p = fgraph_retrieve_data(gops->idx, &size);
+   if (!p) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Failed to retrieve %s\n", type);
+   fgraph_error_str = fgraph_error_str_buf;
+   return;
+   }
+   if (fgraph_store_size > size) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Retrieved size %d is smaller than expected %d\n",
+size, (int)fgraph_store_size);
+   fgraph_error_str = fgraph_error_str_buf;
+   return;
+   }
+
+   switch (fgraph_store_size) {
+   case 1:
+   expect = BYTE_NUMBER;
+   found = *(char *)p;
+   break;
+   case 2:
+   expect = SHORT_NUMBER;
+   found = *(short *)p;
+   break;
+   case 4:
+   expect = WORD_NUMBER;
+   found = *(int *)p;
+   break;
+   case 8:
+   expect = LONG_NUMBER;
+   found = *(long long *)p;
+   break;
+   }
+
+   if (found != expect) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"%s returned not %lld but %lld\n", type, expect, 
found);
+   fgraph_error_str = fgraph_error_str_buf;
+   return;
+   }
+   fgraph_error_str = NULL;
+}
+
+static struct fgraph_ops store_bytes __initdata = {
+   .entryfunc  = store_entry,
+   .retfunc= store_return,
+};
+
+static int __init test_graph_storage_type(const char *name, int size)
+{
+   char *func_name;
+   int len;
+   int ret;
+
+   fgraph_store_type_name = name;
+   fgraph_store_size = size;
+
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Failed to execute storage %s\n", name);
+   fgraph_error_str = fgraph_error_str_buf;
+
+   pr_cont("PASSED\n");
+   pr_info("Testing fgraph storage of %d byte%s: ", size, size > 1 ? "s" : 
"");
+
+   func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+   len = strlen(func_name);
+
+   ret = ftrace_set_filter(&store_bytes.ops, func_name, len, 1);
+   if (ret && ret != -ENODEV) {
+   pr_cont("*Could not set filter* ");
+   return -1;
+   }
+
+   ret = register_ftrace_graph(&store_bytes);
+   if (ret) {
+   pr_warn("Failed to init store_bytes fgraph tracing\n");
+   return -1;

[PATCH 16/20] function_graph: Implement fgraph_reserve_data() and fgraph_retrieve_data()

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Added functions that can be called by a fgraph_ops entryfunc and retfunc to
store state between the entry and the exit of the function being traced.
The fgraph_ops entryfunc() may call fgraph_reserve_data() to store up to
32 words onto the task's shadow ret_stack, and this can then be retrieved
by fgraph_retrieve_data() called by the corresponding retfunc().
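
A minimal sketch of how an entryfunc/retfunc pair might use this API,
modeled on the selftest added later in this series (my_entry/my_return are
illustrative names, not part of this patch):

```c
static int my_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
{
	int *p = fgraph_reserve_data(gops->idx, sizeof(*p));

	if (!p)
		return 0;	/* no room on the shadow ret_stack; skip this function */

	*p = 42;		/* state carried from entry to exit */
	return 1;		/* request the retfunc callback */
}

static void my_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
{
	int size;
	int *p = fgraph_retrieve_data(gops->idx, &size);

	if (p)
		pr_info("stored value: %d\n", *p);
}
```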

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509109089.162236.11372474169781184034.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h |   3 +
 kernel/trace/fgraph.c  | 196 +++--
 2 files changed, 190 insertions(+), 9 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 80eb1ab3cae3..1f6a6dc1e140 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1046,6 +1046,9 @@ struct fgraph_ops {
int idx;
 };
 
+void *fgraph_reserve_data(int idx, int size_bytes);
+void *fgraph_retrieve_data(int idx, int *size_bytes);
+
 /*
  * Stack of return addresses for functions
  * of a thread.
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 0d536a48f696..4d503b3e45ad 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -32,12 +32,11 @@
  * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
  *
  * bits:  0 -  9   offset in words from the previous ftrace_ret_stack
- * Currently, this will always be set to 
FGRAPH_FRAME_OFFSET
- * to get to the fgraph frame.
  *
  * bits: 10 - 11   Type of storage
  *   0 - reserved
  *   1 - bitmap of fgraph_array index
+ *   2 - reserved data
  *
  * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
  *  bits: 12 - 27  The bitmap of fgraph_ops fgraph_array index
@@ -50,10 +49,15 @@
  * The top of the ret_stack (when not empty) will always have a reference
  * word that points to the last fgraph frame that was saved.
  *
+ * For reserved data:
+ *  bits: 12 - 17  The size in words that is stored
+ *  bits: 18 - 23  The index of fgraph_array, which shows who is stored
+ *
  * That is, at the end of function_graph_enter, if the first and forth
  * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called
- * on the return of the function being traced, this is what will be on the
- * task's shadow ret_stack: (the stack grows upward)
+ * on the return of the function being traced, and the forth fgraph_ops
+ * stored two words of data, this is what will be on the task's shadow
+ * ret_stack: (the stack grows upward)
  *
  *  ret_stack[SHADOW_STACK_OFFSET]
  * | SHADOW_STACK_TASK_VARS(ret_stack)[15]  |
@@ -63,11 +67,21 @@
  * ...
  * || <- task->curr_ret_stack
  * ++
+ * | (3 << 12) | (3 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way*   |
+ * | (3 << FGRAPH_DATA_INDEX_SHIFT)| \  | This is for fgraph_ops[3].
+ * | ((2 - 1) << FGRAPH_DATA_SHIFT)| \  | The data size is 2 words.
+ * | (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT)| \ |
+ * | (offset2:FGRAPH_FRAME_OFFSET+3)| <- the offset2 is from here
+ * ++ ( It is 4 words from the 
ret_stack)
+ * |STORED DATA WORD 2  |
+ * |STORED DATA WORD 1  |
+ * ++
  * | (9 << 12) | (1 << 10) | FGRAPH_FRAME_OFFSET|
  * | *or put another way*   |
  * | (BIT(3)|BIT(0)) << FGRAPH_INDEX_SHIFT | \  |
  * | FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT| \ |
- * | (offset:FGRAPH_FRAME_OFFSET)   | <- the offset is from here
+ * | (offset1:FGRAPH_FRAME_OFFSET)  | <- the offset1 is from here
  * ++
  * | struct ftrace_ret_stack|
  * |   (stores the saved ret pointer)   | <- the offset points here
@@ -101,6 +115,7 @@
 enum {
FGRAPH_TYPE_RESERVED= 0,
FGRAPH_TYPE_BITMAP  = 1,
+   FGRAPH_TYPE_DATA= 2,
 };
 
 /*
@@ -111,6 +126,26 @@ enum {
 #define FGRAPH_INDEX_MASK  GENMASK(FGRAPH_INDEX_BITS - 1, 0)
 #define FGRAPH_INDEX_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
 
+/*
+ * For DATA type:
+ *  FGRAPH_DATA (12-17) bits hold the size of data (in words)
+ *  FGRAPH_INDEX (18-23) bits hold the index for which gops->idx the data is 
for
+ *
+ * Note:
+ *  data_size == 0 means 1 word, and 31 (=2^5 - 1) means 32 words.
+ */
+#define FGRAPH_DATA_BITS   5
+#define FGRAPH_DATA_MASK   GENMASK(FGRAPH_DATA_BITS - 1, 0)
+#define FGRAPH_DATA_SHIFT  (FGRAPH_TYPE_SHIFT + 

[PATCH 15/20] function_graph: Move graph notrace bit to shadow stack global var

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

The use of the task->trace_recursion for the logic used for the function
graph no-trace was a bit of an abuse of that variable. Now that there
exist global vars that are per stack for registered graph traces, use
those instead.

Link: 
https://lore.kernel.org/linux-trace-kernel/171509107907.162236.6564679266777519065.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/trace_recursion.h  |  7 ---
 kernel/trace/trace.h |  9 +
 kernel/trace/trace_functions_graph.c | 10 ++
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index fdfb6f66718a..ae04054a1be3 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -44,13 +44,6 @@ enum {
  */
TRACE_IRQ_BIT,
 
-   /*
-* To implement set_graph_notrace, if this bit is set, we ignore
-* function graph tracing of called functions, until the return
-* function is called to clear it.
-*/
-   TRACE_GRAPH_NOTRACE_BIT,
-
/* Used to prevent recursion recording from recursing. */
TRACE_RECORD_RECURSION_BIT,
 };
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5398b6889027..c91953fb58f5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -919,8 +919,17 @@ enum {
 
TRACE_GRAPH_DEPTH_START_BIT,
TRACE_GRAPH_DEPTH_END_BIT,
+
+   /*
+* To implement set_graph_notrace, if this bit is set, we ignore
+* function graph tracing of called functions, until the return
+* function is called to clear it.
+*/
+   TRACE_GRAPH_NOTRACE_BIT,
 };
 
+#define TRACE_GRAPH_NOTRACE(1 << TRACE_GRAPH_NOTRACE_BIT)
+
 static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
 {
return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
diff --git a/kernel/trace/trace_functions_graph.c 
b/kernel/trace/trace_functions_graph.c
index 66cce73e94f8..13d0387ac6a6 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -130,6 +130,7 @@ static inline int ftrace_graph_ignore_irqs(void)
 int trace_graph_entry(struct ftrace_graph_ent *trace,
  struct fgraph_ops *gops)
 {
+   unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
@@ -138,7 +139,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
int ret;
int cpu;
 
-   if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+   if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
 
/*
@@ -149,7 +150,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 * returning from the function.
 */
if (ftrace_graph_notrace_addr(trace->func)) {
-   trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+   *task_var |= TRACE_GRAPH_NOTRACE_BIT;
/*
 * Need to return 1 to have the return called
 * that will clear the NOTRACE bit.
@@ -240,6 +241,7 @@ void __trace_graph_return(struct trace_array *tr,
 void trace_graph_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops)
 {
+   unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
@@ -249,8 +251,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
 
ftrace_graph_addr_finish(gops, trace);
 
-   if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
-   trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+   if (*task_var & TRACE_GRAPH_NOTRACE) {
+   *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
 
-- 
2.43.0





[PATCH 14/20] function_graph: Move graph depth stored data to shadow stack global var

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

The use of the task->trace_recursion for the logic used for the function
graph depth was a bit of an abuse of that variable. Now that there
exist global vars that are per stack for registered graph traces, use
those instead.

Link: 
https://lore.kernel.org/linux-trace-kernel/171509106728.162236.2398372644430125344.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/trace_recursion.h | 29 
 kernel/trace/trace.h| 34 +++--
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 02e6afc6d7fe..fdfb6f66718a 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -44,25 +44,6 @@ enum {
  */
TRACE_IRQ_BIT,
 
-   /*
-* In the very unlikely case that an interrupt came in
-* at a start of graph tracing, and we want to trace
-* the function in that interrupt, the depth can be greater
-* than zero, because of the preempted start of a previous
-* trace. In an even more unlikely case, depth could be 2
-* if a softirq interrupted the start of graph tracing,
-* followed by an interrupt preempting a start of graph
-* tracing in the softirq, and depth can even be 3
-* if an NMI came in at the start of an interrupt function
-* that preempted a softirq start of a function that
-* preempted normal context Luckily, it can't be
-* greater than 3, so the next two bits are a mask
-* of what the depth is when we set TRACE_GRAPH_FL
-*/
-
-   TRACE_GRAPH_DEPTH_START_BIT,
-   TRACE_GRAPH_DEPTH_END_BIT,
-
/*
 * To implement set_graph_notrace, if this bit is set, we ignore
 * function graph tracing of called functions, until the return
@@ -78,16 +59,6 @@ enum {
 #define trace_recursion_clear(bit) do { (current)->trace_recursion &= 
~(1<<(bit)); } while (0)
 #define trace_recursion_test(bit)  ((current)->trace_recursion & 
(1<<(bit)))
 
-#define trace_recursion_depth() \
-   (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
-#define trace_recursion_set_depth(depth) \
-   do {\
-   current->trace_recursion &= \
-   ~(3 << TRACE_GRAPH_DEPTH_START_BIT);\
-   current->trace_recursion |= \
-   ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT;   \
-   } while (0)
-
 #define TRACE_CONTEXT_BITS 4
 
 #define TRACE_FTRACE_START TRACE_FTRACE_BIT
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6f889cc6097e..5398b6889027 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -900,8 +900,38 @@ extern void free_fgraph_ops(struct trace_array *tr);
 
 enum {
TRACE_GRAPH_FL  = 1,
+
+   /*
+* In the very unlikely case that an interrupt came in
+* at a start of graph tracing, and we want to trace
+* the function in that interrupt, the depth can be greater
+* than zero, because of the preempted start of a previous
+* trace. In an even more unlikely case, depth could be 2
+* if a softirq interrupted the start of graph tracing,
+* followed by an interrupt preempting a start of graph
+* tracing in the softirq, and depth can even be 3
+* if an NMI came in at the start of an interrupt function
+* that preempted a softirq start of a function that
+* preempted normal context Luckily, it can't be
+* greater than 3, so the next two bits are a mask
+* of what the depth is when we set TRACE_GRAPH_FL
+*/
+
+   TRACE_GRAPH_DEPTH_START_BIT,
+   TRACE_GRAPH_DEPTH_END_BIT,
 };
 
+static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
+{
+   return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
+}
+
+static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth)
+{
+   *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT);
+   *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
 extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
@@ -934,7 +964,7 @@ ftrace_graph_addr(unsigned long *task_var, struct 
ftrace_graph_ent *trace)
 * when the depth is zero.
 */
*task_var |= TRACE_GRAPH_FL;
-   trace_recursion_set_depth(trace->depth);
+   ftrace_graph_set_depth(task_var, trace->depth);
 
/*
 * If no irqs are to be traced, but a set_graph_function
@@ -959,7 +989,7 @@ 

[PATCH 13/20] function_graph: Move set_graph_function tests to shadow stack global var

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

The use of the task->trace_recursion for the logic used for the
set_graph_function was a bit of an abuse of that variable. Now that there
exist global vars that are per stack for registered graph traces, use
those instead.

Link: 
https://lore.kernel.org/linux-trace-kernel/171509105520.162236.10339831553995971290.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/trace_recursion.h  |  5 +
 kernel/trace/trace.h | 32 ++--
 kernel/trace/trace_functions_graph.c |  6 +++---
 kernel/trace/trace_irqsoff.c |  4 ++--
 kernel/trace/trace_sched_wakeup.c|  4 ++--
 5 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 24ea8ac049b4..02e6afc6d7fe 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -44,9 +44,6 @@ enum {
  */
TRACE_IRQ_BIT,
 
-   /* Set if the function is in the set_graph_function file */
-   TRACE_GRAPH_BIT,
-
/*
 * In the very unlikely case that an interrupt came in
 * at a start of graph tracing, and we want to trace
@@ -60,7 +57,7 @@ enum {
 * that preempted a softirq start of a function that
 * preempted normal context Luckily, it can't be
 * greater than 3, so the next two bits are a mask
-* of what the depth is when we set TRACE_GRAPH_BIT
+* of what the depth is when we set TRACE_GRAPH_FL
 */
 
TRACE_GRAPH_DEPTH_START_BIT,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ced61eacf40f..6f889cc6097e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -898,11 +898,16 @@ extern void init_array_fgraph_ops(struct trace_array *tr, 
struct ftrace_ops *ops
 extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
 extern void free_fgraph_ops(struct trace_array *tr);
 
+enum {
+   TRACE_GRAPH_FL  = 1,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
 extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
 
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int
+ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
 {
unsigned long addr = trace->func;
int ret = 0;
@@ -924,12 +929,11 @@ static inline int ftrace_graph_addr(struct 
ftrace_graph_ent *trace)
}
 
if (ftrace_lookup_ip(hash, addr)) {
-
/*
 * This needs to be cleared on the return functions
 * when the depth is zero.
 */
-   trace_recursion_set(TRACE_GRAPH_BIT);
+   *task_var |= TRACE_GRAPH_FL;
trace_recursion_set_depth(trace->depth);
 
/*
@@ -949,11 +953,14 @@ static inline int ftrace_graph_addr(struct 
ftrace_graph_ent *trace)
return ret;
 }
 
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void
+ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret 
*trace)
 {
-   if (trace_recursion_test(TRACE_GRAPH_BIT) &&
+   unsigned long *task_var = fgraph_get_task_var(gops);
+
+   if ((*task_var & TRACE_GRAPH_FL) &&
trace->depth == trace_recursion_depth())
-   trace_recursion_clear(TRACE_GRAPH_BIT);
+   *task_var &= ~TRACE_GRAPH_FL;
 }
 
 static inline int ftrace_graph_notrace_addr(unsigned long addr)
@@ -980,7 +987,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long 
addr)
 }
 
 #else
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int ftrace_graph_addr(unsigned long *task_var, struct 
ftrace_graph_ent *trace)
 {
return 1;
 }
@@ -989,17 +996,20 @@ static inline int ftrace_graph_notrace_addr(unsigned long 
addr)
 {
return 0;
 }
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct 
ftrace_graph_ret *trace)
 { }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 extern unsigned int fgraph_max_depth;
 
-static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+static inline bool
+ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent 
*trace)
 {
+   unsigned long *task_var = fgraph_get_task_var(gops);
+
/* trace it when it is-nested-in or is a function enabled. */
-   return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
-ftrace_graph_addr(trace)) ||
+   return !((*task_var & TRACE_GRAPH_FL) ||
+ftrace_graph_addr(task_var, trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
 }
diff --git a/kernel/trace/trace_functions_graph.c 

[PATCH 12/20] function_graph: Add "task variables" per task for fgraph_ops

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Add a "task variables" array on the tasks shadow ret_stack that is the
size of longs for each possible registered fgraph_ops. That's a total
of 16, taking up 8 * 16 = 128 bytes (out of a page size 4k).

This will allow for fgraph_ops to do specific features on a per task basis
having a way to maintain state for each task.

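As a rough stand-alone model (user-space C, not the kernel code) of that layout:
a page of longs whose top 16 words are reserved, one per possible fgraph_ops
slot. The sizes and helper names here are assumptions for illustration.

```
#include <assert.h>
#include <string.h>

#define PAGE_SIZE	4096
#define FGRAPH_SLOTS	16
#define STACK_WORDS	(PAGE_SIZE / sizeof(long))

/* The reserved per-task variables live at the very top of the page. */
static unsigned long *task_vars(unsigned long *ret_stack)
{
	return &ret_stack[STACK_WORDS - FGRAPH_SLOTS];
}

int main(void)
{
	static unsigned long ret_stack[STACK_WORDS];

	memset(task_vars(ret_stack), 0, FGRAPH_SLOTS * sizeof(long));
	task_vars(ret_stack)[3] = 0xdead;	/* state owned by fgraph_ops idx 3 */
	assert(task_vars(ret_stack)[3] == 0xdead);
	return 0;
}
```
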
Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509104383.162236.12239656156685718550.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h |  1 +
 kernel/trace/fgraph.c  | 74 +-
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e5b41683ffb9..80eb1ab3cae3 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1087,6 +1087,7 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int 
skip);
 
 unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp);
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops);
 
 /*
  * Sometimes we don't want to trace a function with the function
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8e029d5e94f6..0d536a48f696 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -55,6 +55,10 @@
  * on the return of the function being traced, this is what will be on the
  * task's shadow ret_stack: (the stack grows upward)
  *
+ *  ret_stack[SHADOW_STACK_OFFSET]
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[15]  |
+ * ...
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[0]   |
  *  ret_stack[SHADOW_STACK_MAX_OFFSET]
  * ...
  * || <- task->curr_ret_stack
@@ -117,11 +121,19 @@ enum {
 #define SHADOW_STACK_SIZE  (PAGE_SIZE)
 #define SHADOW_STACK_OFFSET(SHADOW_STACK_SIZE / sizeof(long))
 /* Leave on a buffer at the end */
-#define SHADOW_STACK_MAX_OFFSET (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 
1))
+#define SHADOW_STACK_MAX_OFFSET\
+   (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1 + FGRAPH_ARRAY_SIZE))
 
 /* RET_STACK():Return the frame from a given @offset from task 
@t */
 #define RET_STACK(t, offset) ((struct ftrace_ret_stack 
*)(&(t)->ret_stack[offset]))
 
+/*
+ * Each fgraph_ops has a reservered unsigned long at the end (top) of the
+ * ret_stack to store task specific state.
+ */
+#define SHADOW_STACK_TASK_VARS(ret_stack) \
+   ((unsigned long *)(&(ret_stack)[SHADOW_STACK_OFFSET - 
FGRAPH_ARRAY_SIZE]))
+
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
@@ -212,6 +224,44 @@ static void return_run(struct ftrace_graph_ret *trace, 
struct fgraph_ops *ops)
 {
 }
 
+static void ret_stack_set_task_var(struct task_struct *t, int idx, long val)
+{
+   unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+   gvals[idx] = val;
+}
+
+static unsigned long *
+ret_stack_get_task_var(struct task_struct *t, int idx)
+{
+   unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+   return &gvals[idx];
+}
+
+static void ret_stack_init_task_vars(unsigned long *ret_stack)
+{
+   unsigned long *gvals = SHADOW_STACK_TASK_VARS(ret_stack);
+
+   memset(gvals, 0, sizeof(*gvals) * FGRAPH_ARRAY_SIZE);
+}
+
+/**
+ * fgraph_get_task_var - retrieve a task specific state variable
+ * @gops: The ftrace_ops that owns the task specific variable
+ *
+ * Every registered fgraph_ops has a task state variable
+ * reserved on the task's ret_stack. This function returns the
+ * address to that variable.
+ *
+ * Returns the address to the fgraph_ops @gops tasks specific
+ * unsigned long variable.
+ */
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops)
+{
+   return ret_stack_get_task_var(current, gops->idx);
+}
+
 /*
  * @offset: The offset into @t->ret_stack to find the ret_stack entry
  * @frame_offset: Where to place the offset into @t->ret_stack of that entry
@@ -803,6 +853,7 @@ static int alloc_retstack_tasklist(unsigned long 
**ret_stack_list)
 
if (t->ret_stack == NULL) {
atomic_set(&t->trace_overrun, 0);
+   ret_stack_init_task_vars(ret_stack_list[start]);
t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
/* Make sure the tasks see the 0 first: */
@@ -863,6 +914,7 @@ static void
 graph_init_task(struct task_struct *t, unsigned long *ret_stack)
 {
atomic_set(&t->trace_overrun, 0);
+   ret_stack_init_task_vars(ret_stack);
t->ftrace_timestamp = 0;
t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
@@ -961,6 +1013,24 @@ static int start_graph_tracing(void)
return ret;
 }
 
+static void init_task_vars(int idx)
+{
+   struct 

[PATCH 10/20] function_graph: Have the instances use their own ftrace_ops for filtering

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Allow instances to have their own ftrace_ops as part of the fgraph_ops,
which makes the function_graph tracer filter on the instance's
set_ftrace_filter file rather than on the top instance's.

Note that this also requires updating ftrace_graph_func() to call the new
function_graph_enter_ops() instead of function_graph_enter(), so that it
avoids pushing onto the shadow stack multiple times for the same function.

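The arch callbacks above lean on one small trick worth spelling out: the
ftrace_ops they receive is embedded inside a fgraph_ops, so container_of()
recovers the owning fgraph_ops. A stand-alone sketch with simplified,
illustrative structure layouts (not the real definitions):

```
#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ftrace_ops { void *private; };

struct fgraph_ops {
	int idx;
	struct ftrace_ops ops;	/* embedded ops, as in the real structure */
};

int main(void)
{
	struct fgraph_ops gops = { .idx = 5 };
	struct ftrace_ops *op = &gops.ops;	/* what the handler is passed */

	struct fgraph_ops *back = container_of(op, struct fgraph_ops, ops);
	assert(back == &gops && back->idx == 5);
	return 0;
}
```
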
Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509102088.162236.15758883237657317789.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 arch/arm64/kernel/ftrace.c   |  21 -
 arch/loongarch/kernel/ftrace_dyn.c   |  15 ++-
 arch/powerpc/kernel/trace/ftrace.c   |   3 +-
 arch/riscv/kernel/ftrace.c   |  15 ++-
 arch/x86/kernel/ftrace.c |  19 +++-
 include/linux/ftrace.h   |   6 ++
 kernel/trace/fgraph.c| 131 ---
 kernel/trace/ftrace.c|   4 +-
 kernel/trace/trace.h |  16 ++--
 kernel/trace/trace_functions.c   |   2 +-
 kernel/trace/trace_functions_graph.c |   8 +-
 11 files changed, 190 insertions(+), 50 deletions(-)

diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index a650f5e11fc5..b96740829798 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -481,7 +481,26 @@ void prepare_ftrace_return(unsigned long self_addr, 
unsigned long *parent,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-   prepare_ftrace_return(ip, &fregs->lr, fregs->fp);
+   struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
+   unsigned long frame_pointer = fregs->fp;
+   unsigned long *parent = &fregs->lr;
+   int bit;
+
+   if (unlikely(ftrace_graph_is_dead()))
+   return;
+
+   if (unlikely(atomic_read(&current->tracing_graph_pause)))
+   return;
+
+   bit = ftrace_test_recursion_trylock(ip, *parent);
+   if (bit < 0)
+   return;
+
+   if (!function_graph_enter_ops(*parent, ip, frame_pointer,
+ (void *)frame_pointer, gops))
+   *parent = (unsigned long)&return_to_handler;
+
+   ftrace_test_recursion_unlock(bit);
 }
 #else
 /*
diff --git a/arch/loongarch/kernel/ftrace_dyn.c 
b/arch/loongarch/kernel/ftrace_dyn.c
index bff058317062..77acb69ad153 100644
--- a/arch/loongarch/kernel/ftrace_dyn.c
+++ b/arch/loongarch/kernel/ftrace_dyn.c
@@ -241,10 +241,21 @@ void prepare_ftrace_return(unsigned long self_addr, 
unsigned long *parent)
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
+   struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
+   unsigned long return_hooker = (unsigned long)&return_to_handler;
struct pt_regs *regs = &fregs->regs;
-   unsigned long *parent = (unsigned long *)&regs->regs[1];
+   unsigned long *parent;
+   unsigned long old;
+
+   parent = (unsigned long *)&regs->regs[1];
 
-   prepare_ftrace_return(ip, (unsigned long *)parent);
+   if (unlikely(atomic_read(&current->tracing_graph_pause)))
+   return;
+
+   old = *parent;
+
+   if (!function_graph_enter_ops(old, ip, 0, parent, gops))
+   *parent = return_hooker;
 }
 #else
 static int ftrace_modify_graph_caller(bool enable)
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index d8d6b4fd9a14..4a9294821c0d 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -421,6 +421,7 @@ int __init ftrace_dyn_arch_init(void)
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
+   struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
unsigned long sp = fregs->regs.gpr[1];
int bit;
 
@@ -434,7 +435,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long 
parent_ip,
if (bit < 0)
goto out;
 
-   if (!function_graph_enter(parent_ip, ip, 0, (unsigned long *)sp))
+   if (!function_graph_enter_ops(parent_ip, ip, 0, (unsigned long *)sp, 
gops))
parent_ip = ppc_function_entry(return_to_handler);
 
ftrace_test_recursion_unlock(bit);
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 4f4987a6d83d..7bcb5f321523 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -218,10 +218,23 @@ void prepare_ftrace_return(unsigned long *parent, 
unsigned long self_addr,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
+   struct fgraph_ops *gops 

[PATCH 11/20] function_graph: Use a simple LRU for fgraph_array index number

2024-05-24 Thread Steven Rostedt
From: "Masami Hiramatsu (Google)" 

Since the fgraph_array index is used for the bitmap on the shadow
stack, stale entries may remain on the stack after a function_graph
instance is removed. If another instance then reuses that fgraph_array
index soon after it is released, fgraph may mistakenly call the newer
callback for entries that were pushed by the older instance.
To avoid reusing an fgraph_array index soon after it is released, introduce
a simple LRU table for managing the index numbers. This reduces the
possibility of such confusion.

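A stand-alone model of that allocator (illustrative only, not the kernel code):
released indexes are appended at the tail of a ring and new allocations are
taken from the head, so a just-released index is handed out last rather than
immediately reused.

```
#include <assert.h>

#define ARRAY_SIZE 16

static int table[ARRAY_SIZE];
static int next, last;

static void lru_init(void)
{
	for (int i = 0; i < ARRAY_SIZE; i++)
		table[i] = i;
}

static int lru_alloc(void)
{
	int idx = table[next];

	if (idx == -1)
		return -1;		/* no index available */
	table[next] = -1;
	next = (next + 1) % ARRAY_SIZE;
	return idx;
}

static int lru_release(int idx)
{
	if (idx < 0 || idx >= ARRAY_SIZE || table[last] != -1)
		return -1;
	table[last] = idx;
	last = (last + 1) % ARRAY_SIZE;
	return 0;
}

int main(void)
{
	lru_init();
	int a = lru_alloc();		/* gets index 0 */
	lru_release(a);
	/* The next allocation is 1, not the just-released 0. */
	assert(lru_alloc() == 1);
	return 0;
}
```
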
Link: 
https://lore.kernel.org/linux-trace-kernel/171509103267.162236.6885097397289135378.stgit@devnote2

Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 71 ++-
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 67fa7fbf6aac..8e029d5e94f6 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -125,10 +125,48 @@ enum {
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
-static int fgraph_array_cnt;
-
 static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
 
+/* LRU index table for fgraph_array */
+static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
+static int fgraph_lru_next;
+static int fgraph_lru_last;
+
+/* Initialize fgraph_lru_table with unused index */
+static void fgraph_lru_init(void)
+{
+   int i;
+
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+   fgraph_lru_table[i] = i;
+}
+
+/* Release the used index to the LRU table */
+static int fgraph_lru_release_index(int idx)
+{
+   if (idx < 0 || idx >= FGRAPH_ARRAY_SIZE ||
+   WARN_ON_ONCE(fgraph_lru_table[fgraph_lru_last] != -1))
+   return -1;
+
+   fgraph_lru_table[fgraph_lru_last] = idx;
+   fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+   return 0;
+}
+
+/* Allocate a new index from LRU table */
+static int fgraph_lru_alloc_index(void)
+{
+   int idx = fgraph_lru_table[fgraph_lru_next];
+
+   /* No id is available */
+   if (idx == -1)
+   return -1;
+
+   fgraph_lru_table[fgraph_lru_next] = -1;
+   fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+   return idx;
+}
+
 /* Get the FRAME_OFFSET from the word from the @offset on ret_stack */
 static inline int get_frame_offset(struct task_struct *t, int offset)
 {
@@ -375,7 +413,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
if (offset < 0)
goto out;
 
-   for (i = 0; i < fgraph_array_cnt; i++) {
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
struct fgraph_ops *gops = fgraph_array[i];
 
if (gops == &fgraph_stub)
@@ -927,7 +965,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 {
int command = 0;
int ret = 0;
-   int i;
+   int i = -1;
 
mutex_lock(&ftrace_lock);
 
@@ -943,21 +981,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
fgraph_array[i] = &fgraph_stub;
+   fgraph_lru_init();
}
 
-   /* Look for an available spot */
-   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
-   if (fgraph_array[i] == &fgraph_stub)
-   break;
-   }
-   if (i >= FGRAPH_ARRAY_SIZE) {
+   i = fgraph_lru_alloc_index();
+   if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) {
ret = -ENOSPC;
goto out;
}
 
fgraph_array[i] = gops;
-   if (i + 1 > fgraph_array_cnt)
-   fgraph_array_cnt = i + 1;
gops->idx = i;
 
ftrace_graph_active++;
@@ -981,6 +1014,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
if (ret) {
fgraph_array[i] = &fgraph_stub;
ftrace_graph_active--;
+   fgraph_lru_release_index(i);
}
 out:
mutex_unlock(&ftrace_lock);
@@ -990,25 +1024,20 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 void unregister_ftrace_graph(struct fgraph_ops *gops)
 {
int command = 0;
-   int i;
 
mutex_lock(&ftrace_lock);
 
if (unlikely(!ftrace_graph_active))
goto out;
 
-   if (unlikely(gops->idx < 0 || gops->idx >= fgraph_array_cnt))
+   if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
+fgraph_array[gops->idx] != gops))
goto out;
 
-   WARN_ON_ONCE(fgraph_array[gops->idx] != gops);
+   if (fgraph_lru_release_index(gops->idx) < 0)
+   goto out;
 
fgraph_array[gops->idx] = &fgraph_stub;
-   if (gops->idx + 1 == fgraph_array_cnt) {
-   i = gops->idx;
-   while (i >= 0 && fgraph_array[i] == &fgraph_stub)
-   i--;
-   fgraph_array_cnt = i + 1;
-   }
 

[PATCH 09/20] ftrace: Allow ftrace startup flags to exist without dynamic ftrace

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Some of the flags for ftrace_startup() may be exposed even when
CONFIG_DYNAMIC_FTRACE is not configured in. This is fine as the difference
between dynamic ftrace and static ftrace is done within the internals of
ftrace itself. No need to have use cases fail to compile because dynamic
ftrace is disabled.

This change is needed to move some of the logic of what is passed to
ftrace_startup() out of the parameters of ftrace_startup().

Link: 
https://lore.kernel.org/linux-trace-kernel/171509100890.162236.436235034254912.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f7a948337d39..942a1f767280 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -509,6 +509,15 @@ static inline void stack_tracer_disable(void) { }
 static inline void stack_tracer_enable(void) { }
 #endif
 
+enum {
+   FTRACE_UPDATE_CALLS = (1 << 0),
+   FTRACE_DISABLE_CALLS= (1 << 1),
+   FTRACE_UPDATE_TRACE_FUNC= (1 << 2),
+   FTRACE_START_FUNC_RET   = (1 << 3),
+   FTRACE_STOP_FUNC_RET= (1 << 4),
+   FTRACE_MAY_SLEEP= (1 << 5),
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 void ftrace_arch_code_modify_prepare(void);
@@ -603,15 +612,6 @@ void ftrace_set_global_notrace(unsigned char *buf, int 
len, int reset);
 void ftrace_free_filter(struct ftrace_ops *ops);
 void ftrace_ops_set_global_filter(struct ftrace_ops *ops);
 
-enum {
-   FTRACE_UPDATE_CALLS = (1 << 0),
-   FTRACE_DISABLE_CALLS= (1 << 1),
-   FTRACE_UPDATE_TRACE_FUNC= (1 << 2),
-   FTRACE_START_FUNC_RET   = (1 << 3),
-   FTRACE_STOP_FUNC_RET= (1 << 4),
-   FTRACE_MAY_SLEEP= (1 << 5),
-};
-
 /*
  * The FTRACE_UPDATE_* enum is used to pass information back
  * from the ftrace_update_record() and ftrace_test_record()
-- 
2.43.0





[PATCH 08/20] ftrace: Allow function_graph tracer to be enabled in instances

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Now that function graph tracing can handle more than one user, allow it to
be enabled in the ftrace instances. Note, the filtering of functions is
still shared with the top-level set_ftrace_filter and friends, as well as the
graph and nograph files.

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509099743.162236.1699959255446248163.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h   |  1 +
 kernel/trace/ftrace.c|  1 +
 kernel/trace/trace.h | 13 +-
 kernel/trace/trace_functions.c   |  8 
 kernel/trace/trace_functions_graph.c | 65 +---
 kernel/trace/trace_selftest.c|  4 +-
 6 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 032974f55c5b..f7a948337d39 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1041,6 +1041,7 @@ extern int ftrace_graph_entry_stub(struct 
ftrace_graph_ent *trace, struct fgraph
 struct fgraph_ops {
trace_func_graph_ent_t  entryfunc;
trace_func_graph_ret_t  retfunc;
+   void*private;
int idx;
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d18387c0642d..b85f00b0ffe7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7327,6 +7327,7 @@ __init void ftrace_init_global_array_ops(struct 
trace_array *tr)
tr->ops = &global_ops;
tr->ops->private = tr;
ftrace_init_trace_array(tr);
+   init_array_fgraph_ops(tr);
 }
 
 void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2575ec243350..a5070f9b977b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -397,6 +397,9 @@ struct trace_array {
struct ftrace_ops   *ops;
struct trace_pid_list   __rcu *function_pids;
struct trace_pid_list   __rcu *function_no_pids;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+   struct fgraph_ops   *gops;
+#endif
 #ifdef CONFIG_DYNAMIC_FTRACE
/* All of these are protected by the ftrace_lock */
struct list_headfunc_probes;
@@ -681,7 +684,6 @@ void print_trace_header(struct seq_file *m, struct 
trace_iterator *iter);
 
 void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops 
*gops);
 int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
-void set_graph_array(struct trace_array *tr);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
@@ -892,6 +894,9 @@ extern int __trace_graph_entry(struct trace_array *tr,
 extern void __trace_graph_return(struct trace_array *tr,
 struct ftrace_graph_ret *trace,
 unsigned int trace_ctx);
+extern void init_array_fgraph_ops(struct trace_array *tr);
+extern int allocate_fgraph_ops(struct trace_array *tr);
+extern void free_fgraph_ops(struct trace_array *tr);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1004,6 +1009,12 @@ print_graph_function_flags(struct trace_iterator *iter, 
u32 flags)
 {
return TRACE_TYPE_UNHANDLED;
 }
+static inline void init_array_fgraph_ops(struct trace_array *tr) { }
+static inline int allocate_fgraph_ops(struct trace_array *tr)
+{
+   return 0;
+}
+static inline void free_fgraph_ops(struct trace_array *tr) { }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 extern struct list_head ftrace_pids;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1bfbe105e8..8e8da0d0ee52 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -80,6 +80,7 @@ void ftrace_free_ftrace_ops(struct trace_array *tr)
 int ftrace_create_function_files(struct trace_array *tr,
 struct dentry *parent)
 {
+   int ret;
/*
 * The top level array uses the "global_ops", and the files are
 * created on boot up.
@@ -90,6 +91,12 @@ int ftrace_create_function_files(struct trace_array *tr,
if (!tr->ops)
return -EINVAL;
 
+   ret = allocate_fgraph_ops(tr);
+   if (ret) {
+   kfree(tr->ops);
+   return ret;
+   }
+
ftrace_create_filter_files(tr->ops, parent);
 
return 0;
@@ -99,6 +106,7 @@ void ftrace_destroy_function_files(struct trace_array *tr)
 {
ftrace_destroy_filter_files(tr->ops);
ftrace_free_ftrace_ops(tr);
+   free_fgraph_ops(tr);
 }
 
 static ftrace_func_t select_trace_function(u32 flags_val)
diff --git a/kernel/trace/trace_functions_graph.c 
b/kernel/trace/trace_functions_graph.c
index b7b142b65299..9ccc904a7703 100644
--- 

[PATCH 07/20] ftrace/function_graph: Pass fgraph_ops to function graph callbacks

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Pass the fgraph_ops structure to the function graph callbacks. This will
allow a callback to store a descriptor in an fgraph_ops private field (to be
added in the future) and use it from the callbacks. This will be useful
when more than one callback can be registered with the function graph tracer.

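A hedged sketch of what this enables: with the fgraph_ops passed in, a callback
can reach its own per-user descriptor instead of relying on globals. The private
field mentioned above (added later in the series) and the tracer_data type are
modeled here for illustration; the structures are simplified, not the real ones.

```
#include <assert.h>

struct fgraph_ops;

struct ftrace_graph_ent { unsigned long func; int depth; };

typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *,
				      struct fgraph_ops *);

struct fgraph_ops {
	trace_func_graph_ent_t entryfunc;
	void *private;			/* per-user descriptor */
};

struct tracer_data { int events; };

static int my_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
{
	struct tracer_data *data = gops->private;

	data->events++;			/* per-user state, no globals needed */
	return 1;
}

int main(void)
{
	struct tracer_data data = { 0 };
	struct fgraph_ops gops = { .entryfunc = my_entry, .private = &data };
	struct ftrace_graph_ent ent = { .func = 0x1234, .depth = 0 };

	gops.entryfunc(&ent, &gops);
	assert(data.events == 1);
	return 0;
}
```
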
Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509098588.162236.4787930115997357578.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h   | 10 +++---
 kernel/trace/fgraph.c| 16 +---
 kernel/trace/ftrace.c|  6 --
 kernel/trace/trace.h |  4 ++--
 kernel/trace/trace_functions_graph.c | 11 +++
 kernel/trace/trace_irqsoff.c |  6 --
 kernel/trace/trace_sched_wakeup.c|  6 --
 kernel/trace/trace_selftest.c|  5 +++--
 8 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4d4c146fbfbc..032974f55c5b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1026,11 +1026,15 @@ struct ftrace_graph_ret {
unsigned long long rettime;
 } __packed;
 
+struct fgraph_ops;
+
 /* Type of the callback handlers for tracing function graph*/
-typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
-typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *,
+  struct fgraph_ops *); /* return */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *,
+ struct fgraph_ops *); /* entry */
 
-extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace);
+extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct 
fgraph_ops *gops);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 2b52afa03ab4..54ed2ed2036b 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -164,13 +164,13 @@ set_bitmap(struct task_struct *t, int offset, unsigned 
long bitmap)
 }
 
 /* ftrace_graph_entry set to this to tell some archs to run function graph */
-static int entry_run(struct ftrace_graph_ent *trace)
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
 {
return 0;
 }
 
 /* ftrace_graph_return set to this to tell some archs to run function graph */
-static void return_run(struct ftrace_graph_ret *trace)
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops)
 {
 }
 
@@ -234,12 +234,14 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 }
 #endif
 
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+   struct fgraph_ops *gops)
 {
return 0;
 }
 
-static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace)
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
 {
 }
 
@@ -379,7 +381,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
if (gops == &fgraph_stub)
continue;
 
-   if (gops->entryfunc(&trace))
+   if (gops->entryfunc(&trace, gops))
bitmap |= BIT(i);
}
 
@@ -527,7 +529,7 @@ static unsigned long __ftrace_return_to_handler(struct 
fgraph_ret_regs *ret_regs
if (gops == _stub)
continue;
 
-   gops->retfunc();
+   gops->retfunc(, gops);
}
 
/*
@@ -681,7 +683,7 @@ void ftrace_graph_sleep_time_control(bool enable)
  * Simply points to ftrace_stub, but with the proper protocol.
  * Defined by the linker script in linux/vmlinux.lds.h
  */
-extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops 
*gops);
 
 /* The callbacks that hook a function */
 trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 789950a4f977..d18387c0642d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -815,7 +815,8 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
 }
 
-static int profile_graph_entry(struct ftrace_graph_ent *trace)
+static int profile_graph_entry(struct ftrace_graph_ent *trace,
+  struct fgraph_ops *gops)
 {
struct ftrace_ret_stack *ret_stack;
 
@@ -832,7 +833,8 @@ static int profile_graph_entry(struct ftrace_graph_ent 
*trace)
return 1;
 }
 
-static void profile_graph_return(struct ftrace_graph_ret *trace)
+static void profile_graph_return(struct ftrace_graph_ret 

[PATCH 05/20] function_graph: Handle tail calls for stack unwinding

2024-05-24 Thread Steven Rostedt
From: "Masami Hiramatsu (Google)" 

For a tail call, there can be two or more ftrace_ret_stack entries on the
ret_stack, all of which record "return_to_handler" as the return address
except for the last one. But on the real stack there is only one entry,
because a tail call reuses the return address on the stack and jumps to the
next function.

In ftrace_graph_ret_addr(), which is used for stack unwinding, skip tail
calls as a real stack unwinder would do.

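A stand-alone model of the unwinder rule described above (purely illustrative):
when several shadow-stack frames share the same &retp because of tail calls,
only the last one, whose saved ret is not return_to_handler, supplies the real
return address. The constants and frame layout are assumptions of the sketch.

```
#include <assert.h>

#define RETURN_TO_HANDLER 0xdead0000UL

struct frame { unsigned long ret; unsigned long *retp; };

static unsigned long lookup(struct frame *stack, int depth,
			    unsigned long ret, unsigned long *retp)
{
	if (ret != RETURN_TO_HANDLER)
		return ret;
	for (int i = depth - 1; i >= 0; i--) {
		/* Skip tail-call frames that still point at the trampoline. */
		if (stack[i].retp == retp && stack[i].ret != RETURN_TO_HANDLER)
			return stack[i].ret;
	}
	return ret;
}

int main(void)
{
	unsigned long slot;			/* the one real stack slot */
	struct frame stack[2] = {
		{ 0x4000, &slot },		/* original caller */
		{ RETURN_TO_HANDLER, &slot },	/* tail-called function */
	};

	assert(lookup(stack, 2, RETURN_TO_HANDLER, &slot) == 0x4000);
	return 0;
}
```
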
Link: 
https://lore.kernel.org/linux-trace-kernel/171509096221.162236.8806372072523195752.stgit@devnote2

Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index aae51f746828..8de2a2662281 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -594,16 +594,26 @@ unsigned long ftrace_graph_ret_addr(struct task_struct 
*task, int *idx,
unsigned long ret, unsigned long *retp)
 {
struct ftrace_ret_stack *ret_stack;
+   unsigned long return_handler = (unsigned 
long)dereference_kernel_function_descriptor(return_to_handler);
int i = task->curr_ret_stack;
 
-   if (ret != (unsigned 
long)dereference_kernel_function_descriptor(return_to_handler))
+   if (ret != return_handler)
return ret;
 
while (i > 0) {
ret_stack = get_ret_stack(current, i, &i);
if (!ret_stack)
break;
-   if (ret_stack->retp == retp)
+   /*
+* For the tail-call, there would be 2 or more 
ftrace_ret_stacks on
+* the ret_stack, which records "return_to_handler" as the 
return
+* address except for the last one.
+* But on the real stack, there should be 1 entry because 
tail-call
+* reuses the return address on the stack and jump to the next 
function.
+* Thus we will continue to find real return address.
+*/
+   if (ret_stack->retp == retp &&
+   ret_stack->ret != return_handler)
return ret_stack->ret;
}
 
@@ -614,10 +624,11 @@ unsigned long ftrace_graph_ret_addr(struct task_struct 
*task, int *idx,
unsigned long ret, unsigned long *retp)
 {
struct ftrace_ret_stack *ret_stack;
+   unsigned long return_handler = (unsigned 
long)dereference_kernel_function_descriptor(return_to_handler);
int offset = task->curr_ret_stack;
int i;
 
-   if (ret != (unsigned 
long)dereference_kernel_function_descriptor(return_to_handler))
+   if (ret != return_handler)
return ret;
 
if (!idx)
@@ -626,6 +637,8 @@ unsigned long ftrace_graph_ret_addr(struct task_struct 
*task, int *idx,
i = *idx;
do {
ret_stack = get_ret_stack(task, offset, &offset);
+   if (ret_stack && ret_stack->ret == return_handler)
+   continue;
i--;
} while (i >= 0 && ret_stack);
 
-- 
2.43.0





[PATCH 06/20] function_graph: Remove logic around ftrace_graph_entry and return

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

The function pointers ftrace_graph_entry and ftrace_graph_return are no
longer called via the function_graph tracer. Instead, an array structure is
now used that will allow for multiple users of the function_graph
infrastructure. The variables are still used by the architecture code for
non dynamic ftrace configs, where a test is made against them to see if
they point to the default stub function or not. This is how the static
function tracing knows to call into the function graph tracer
infrastructure or not.

Two new stub functions are added: entry_run() and return_run().
ftrace_graph_entry and ftrace_graph_return are set to them, respectively,
when the function graph tracer is enabled, which triggers the
architecture-specific function graph code to be executed.

This also requires checking the global_ops hash for all calls into the
function_graph tracer.

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509097408.162236.17387844142114638932.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c  | 67 --
 kernel/trace/ftrace.c  |  2 -
 kernel/trace/ftrace_internal.h |  2 -
 3 files changed, 15 insertions(+), 56 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8de2a2662281..2b52afa03ab4 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -163,6 +163,17 @@ set_bitmap(struct task_struct *t, int offset, unsigned 
long bitmap)
(FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
 }
 
+/* ftrace_graph_entry set to this to tell some archs to run function graph */
+static int entry_run(struct ftrace_graph_ent *trace)
+{
+   return 0;
+}
+
+/* ftrace_graph_return set to this to tell some archs to run function graph */
+static void return_run(struct ftrace_graph_ret *trace)
+{
+}
+
 /*
  * @offset: The offset into @t->ret_stack to find the ret_stack entry
  * @frame_offset: Where to place the offset into @t->ret_stack of that entry
@@ -675,7 +686,6 @@ extern void ftrace_stub_graph(struct ftrace_graph_ret *);
 /* The callbacks that hook a function */
 trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
 trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
 static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
@@ -758,46 +768,6 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
}
 }
 
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
-   if (!ftrace_ops_test(&global_ops, trace->func, NULL))
-   return 0;
-   return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-void update_function_graph_func(void)
-{
-   struct ftrace_ops *op;
-   bool do_test = false;
-
-   /*
-* The graph and global ops share the same set of functions
-* to test. If any other ops is on the list, then
-* the graph tracing needs to test if its the function
-* it should call.
-*/
-   do_for_each_ftrace_op(op, ftrace_ops_list) {
-   if (op != &global_ops && op != &graph_ops &&
-   op != &ftrace_list_end) {
-   do_test = true;
-   /* in double loop, break out with goto */
-   goto out;
-   }
-   } while_for_each_ftrace_op(op);
- out:
-   if (do_test)
-   ftrace_graph_entry = ftrace_graph_entry_test;
-   else
-   ftrace_graph_entry = __ftrace_graph_entry;
-}
-
 static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
 
 static void
@@ -939,18 +909,12 @@ int register_ftrace_graph(struct fgraph_ops *gops)
ftrace_graph_active--;
goto out;
}
-
-   ftrace_graph_return = gops->retfunc;
-
/*
-* Update the indirect function to the entryfunc, and the
-* function that gets called to the entry_test first. Then
-* call the update fgraph entry function to determine if
-* the entryfunc should be called directly or not.
+* Some archs just test to see if these are not
+* the default function
 */
-   __ftrace_graph_entry = gops->entryfunc;
-   ftrace_graph_entry = ftrace_graph_entry_test;
-   

[PATCH 04/20] function_graph: Allow multiple users to attach to function graph

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Allow for multiple users to attach to function graph tracer at the same
time. Only 16 simultaneous users can attach to the tracer. This is because
there's an array that stores the pointers to the attached fgraph_ops. When
a function being traced is entered, each of the fgraph_ops entryfuncs is
called, and if one returns non-zero, its index into the array will be added
to the shadow stack.

On exit of the function being traced, the shadow stack will contain the
indexes of the ftrace_ops on the array that want their retfunc to be
called.

Because a function may sleep for a long time (if a task sleeps itself),
the return of the function may literally be days later. If the fgraph_ops
is removed, its place on the array is replaced with an fgraph_ops that
contains the stub functions, and those will be called when the function
finally returns.

If another fgraph_ops is added that happens to get the same index into the
array, its return function may be called. But that's actually the way
things currently work with the old function graph tracer: if one tracer is
removed and another is added, the new one will get the return calls of the
functions traced by the previous one, so this is not a regression. This
can be fixed by adding a counter that is incremented each time the array
item is updated, saving it on the shadow stack as well, so that the callback
won't be called if the saved counter does not match the one on the array.

Note, being able to filter functions when both are called is not completely
handled yet, but that shouldn't be too hard to manage.

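A stand-alone sketch of the bookkeeping described above: the entry path collects
the indexes whose entryfunc returned non-zero into one bitmask word (stored on
the shadow stack in the real code), and the return path calls only those
retfuncs. The constants and names here are illustrative.

```
#include <assert.h>
#include <stdio.h>

#define ARRAY_SIZE	16

int main(void)
{
	unsigned long bitmap = 0;
	int called = 0;

	/* Entry: users at index 2 and 7 asked for the return callback. */
	bitmap |= 1UL << 2;
	bitmap |= 1UL << 7;

	/* Exit: walk only the recorded users. */
	for (int i = 0; i < ARRAY_SIZE; i++) {
		if (bitmap & (1UL << i)) {
			printf("calling retfunc of fgraph_array[%d]\n", i);
			called++;
		}
	}
	assert(called == 2);
	return 0;
}
```
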
Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509096221.162236.8806372072523195752.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h |   3 +-
 kernel/trace/fgraph.c  | 379 -
 2 files changed, 304 insertions(+), 78 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e3a83ebd1b33..4d4c146fbfbc 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1037,6 +1037,7 @@ extern int ftrace_graph_entry_stub(struct 
ftrace_graph_ent *trace);
 struct fgraph_ops {
trace_func_graph_ent_t  entryfunc;
trace_func_graph_ret_t  retfunc;
+   int idx;
 };
 
 /*
@@ -1071,7 +1072,7 @@ function_graph_enter(unsigned long ret, unsigned long 
func,
 unsigned long frame_pointer, unsigned long *retp);
 
 struct ftrace_ret_stack *
-ftrace_graph_get_ret_stack(struct task_struct *task, int idx);
+ftrace_graph_get_ret_stack(struct task_struct *task, int skip);
 
 unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d2ce5d651cf0..aae51f746828 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@
  *
  * Highly modified by Steven Rostedt (VMware).
  */
+#include 
 #include 
 #include 
 #include 
@@ -28,35 +29,177 @@
 /*
  * FGRAPH_FRAME_SIZE:  Size in bytes of the meta data on the shadow stack
  * FGRAPH_FRAME_OFFSET:Size in long words of the meta data frame
- * SHADOW_STACK_SIZE:  The size in bytes of the entire shadow stack
- * SHADOW_STACK_OFFSET:The size in long words of the shadow stack
- * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be 
added
  */
 #define FGRAPH_FRAME_SIZE  sizeof(struct ftrace_ret_stack)
 #define FGRAPH_FRAME_OFFSETDIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
-#define SHADOW_STACK_SIZE (PAGE_SIZE)
-#define SHADOW_STACK_OFFSET\
-   (ALIGN(SHADOW_STACK_SIZE, sizeof(long)) / sizeof(long))
-/* Leave on a buffer at the end */
-#define SHADOW_STACK_MAX_INDEX (SHADOW_STACK_OFFSET - FGRAPH_FRAME_OFFSET)
 
 /*
- * RET_STACK():Return the frame from a given @offset from task 
@t
- * RET_STACK_INC():Reserve one frame size on the stack.
- * RET_STACK_DEC():Remove one frame size from the stack.
+ * On entry to a function (via function_graph_enter()), a new fgraph frame
+ * (ftrace_ret_stack) is pushed onto the stack as well as a word that
+ * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
+ *
+ * bits:  0 -  9   offset in words from the previous ftrace_ret_stack
+ * Currently, this will always be set to 
FGRAPH_FRAME_OFFSET
+ * to get to the fgraph frame.
+ *
+ * bits: 10 - 11   Type of storage
+ *   0 - reserved
+ *   1 - bitmap of fgraph_array index
+ *
+ * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
+ *  bits: 12 - 27  The bitmap of fgraph_ops fgraph_array index
+ * That is, it's a bitmask of 0-15 (16 bits)
+ *  

[PATCH 03/20] function_graph: Add an array structure that will allow multiple callbacks

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Add an array structure that will eventually allow the function graph tracer
to have up to 16 simultaneous callbacks attached. It's an array of 16
fgraph_ops pointers, one of which is assigned when a callback is registered.
On entry of a function, the entryfunc of the first item in the array is
called, and if it returns non-zero, the return callback will be called on
exit of the function.

The array will simplify the process of having more than one callback
attached to the same function, as its index into the array can be stored on
the shadow stack. We only need to save the index, because this will allow
the fgraph_ops to be freed before the function returns (which may happen if
the function calls schedule() and sleeps for a long time).

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509095075.162236.8272148192748284581.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 114 ++
 1 file changed, 81 insertions(+), 33 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index fdb206aeffe3..d2ce5d651cf0 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -52,6 +52,11 @@
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
+static int fgraph_array_cnt;
+#define FGRAPH_ARRAY_SIZE  16
+
+static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
 
@@ -75,6 +80,20 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 }
 #endif
 
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+   return 0;
+}
+
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace)
+{
+}
+
+static struct fgraph_ops fgraph_stub = {
+   .entryfunc = ftrace_graph_entry_stub,
+   .retfunc = ftrace_graph_ret_stub,
+};
+
 /**
  * ftrace_graph_stop - set to permanently disable function graph tracing
  *
@@ -161,7 +180,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
goto out;
 
/* Only trace if the calling function expects to */
-   if (!ftrace_graph_entry(&trace))
+   if (!fgraph_array[0]->entryfunc(&trace))
goto out_ret;
 
return 0;
@@ -276,7 +295,7 @@ static unsigned long __ftrace_return_to_handler(struct 
fgraph_ret_regs *ret_regs
trace.retval = fgraph_ret_regs_return_value(ret_regs);
 #endif
trace.rettime = trace_clock_local();
-   ftrace_graph_return(&trace);
+   fgraph_array[0]->retfunc(&trace);
/*
 * The ftrace_graph_return() may still access the current
 * ret_stack structure, we need to make sure the update of
@@ -412,11 +431,6 @@ void ftrace_graph_sleep_time_control(bool enable)
fgraph_sleep_time = enable;
 }
 
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
-{
-   return 0;
-}
-
 /*
  * Simply points to ftrace_stub, but with the proper protocol.
  * Defined by the linker script in linux/vmlinux.lds.h
@@ -654,37 +668,54 @@ static int start_graph_tracing(void)
 int register_ftrace_graph(struct fgraph_ops *gops)
 {
int ret = 0;
+   int i;
 
mutex_lock(&ftrace_lock);
 
-   /* we currently allow only one tracer registered at a time */
-   if (ftrace_graph_active) {
+   if (!fgraph_array[0]) {
+   /* The array must always have real data on it */
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+   fgraph_array[i] = &fgraph_stub;
+   }
+
+   /* Look for an available spot */
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
+   if (fgraph_array[i] == &fgraph_stub)
+   break;
+   }
+   if (i >= FGRAPH_ARRAY_SIZE) {
ret = -EBUSY;
goto out;
}
 
-   register_pm_notifier(&ftrace_suspend_notifier);
+   fgraph_array[i] = gops;
+   if (i + 1 > fgraph_array_cnt)
+   fgraph_array_cnt = i + 1;
 
ftrace_graph_active++;
-   ret = start_graph_tracing();
-   if (ret) {
-   ftrace_graph_active--;
-   goto out;
-   }
 
-   ftrace_graph_return = gops->retfunc;
+   if (ftrace_graph_active == 1) {
+   register_pm_notifier(&ftrace_suspend_notifier);
+   ret = start_graph_tracing();
+   if (ret) {
+   ftrace_graph_active--;
+   goto out;
+   }
+
+   ftrace_graph_return = gops->retfunc;
 
-   /*
-* Update the indirect function to the entryfunc, and the
-* function that gets called to the entry_test first. Then
-* call the update fgraph entry function to determine if
-* the entryfunc should be called directly or not.
-*/
-   __ftrace_graph_entry = 

[PATCH 01/20] function_graph: Convert ret_stack to a series of longs

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

In order to make it possible to have multiple callbacks registered with the
function_graph tracer, the retstack needs to be converted from an array of
ftrace_ret_stack structures to an array of longs. This will allow storing
the list of callbacks on the stack for the return side of the functions.

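A stand-alone sketch of what the conversion buys: a frame is just a cast at a
word offset into a flat array of longs, so other kinds of words (such as
callback bookkeeping) can be interleaved later. The trimmed ret_stack fields
and macro names are illustrative, not the kernel definitions.

```
#include <assert.h>

struct ret_stack {
	unsigned long ret;
	unsigned long func;
	unsigned long long calltime;
};

#define FRAME_WORDS \
	((sizeof(struct ret_stack) + sizeof(long) - 1) / sizeof(long))
#define FRAME(stack, offset) ((struct ret_stack *)&(stack)[offset])

int main(void)
{
	unsigned long stack[64];
	int curr = 0;

	/* Push one frame by reserving FRAME_WORDS longs. */
	struct ret_stack *frame = FRAME(stack, curr);
	frame->ret = 0x4000;
	frame->func = 0x8000;
	frame->calltime = 42;
	curr += FRAME_WORDS;

	/* Pop it again and read it back. */
	curr -= FRAME_WORDS;
	assert(FRAME(stack, curr)->func == 0x8000);
	return 0;
}
```
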
Link: 
https://lore.kernel.org/linux-trace-kernel/171509092742.162236.4427737821399314856.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/sched.h |   2 +-
 kernel/trace/fgraph.c | 136 +-
 2 files changed, 83 insertions(+), 55 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61591ac6eab6..352939dab3a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1402,7 +1402,7 @@ struct task_struct {
int curr_ret_depth;
 
/* Stack of return addresses for return function tracing: */
-   struct ftrace_ret_stack *ret_stack;
+   unsigned long   *ret_stack;
 
/* Timestamp for last schedule: */
unsigned long long  ftrace_timestamp;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index a130b2d898f7..c62e6db718a0 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -25,6 +25,30 @@
 #define ASSIGN_OPS_HASH(opsname, val)
 #endif
 
+/*
+ * FGRAPH_FRAME_SIZE:  Size in bytes of the meta data on the shadow stack
+ * FGRAPH_FRAME_OFFSET:Size in long words of the meta data frame
+ * SHADOW_STACK_SIZE:  The size in bytes of the entire shadow stack
+ * SHADOW_STACK_OFFSET:The size in long words of the shadow stack
+ * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be 
added
+ */
+#define FGRAPH_FRAME_SIZE  sizeof(struct ftrace_ret_stack)
+#define FGRAPH_FRAME_OFFSET(ALIGN(FGRAPH_FRAME_SIZE, sizeof(long)) / 
sizeof(long))
+#define SHADOW_STACK_SIZE (PAGE_SIZE)
+#define SHADOW_STACK_OFFSET\
+   (ALIGN(SHADOW_STACK_SIZE, sizeof(long)) / sizeof(long))
+/* Leave on a buffer at the end */
+#define SHADOW_STACK_MAX_INDEX (SHADOW_STACK_OFFSET - FGRAPH_FRAME_OFFSET)
+
+/*
+ * RET_STACK():Return the frame from a given @offset from task 
@t
+ * RET_STACK_INC():Reserve one frame size on the stack.
+ * RET_STACK_DEC():Remove one frame size from the stack.
+ */
+#define RET_STACK(t, index) ((struct ftrace_ret_stack 
*)(&(t)->ret_stack[index]))
+#define RET_STACK_INC(c) ({ c += FGRAPH_FRAME_OFFSET; })
+#define RET_STACK_DEC(c) ({ c -= FGRAPH_FRAME_OFFSET; })
+
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
@@ -69,6 +93,7 @@ static int
 ftrace_push_return_trace(unsigned long ret, unsigned long func,
 unsigned long frame_pointer, unsigned long *retp)
 {
+   struct ftrace_ret_stack *ret_stack;
unsigned long long calltime;
int index;
 
@@ -85,23 +110,25 @@ ftrace_push_return_trace(unsigned long ret, unsigned long 
func,
smp_rmb();
 
/* The return trace stack is full */
-   if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+   if (current->curr_ret_stack >= SHADOW_STACK_MAX_INDEX) {
atomic_inc(&current->trace_overrun);
return -EBUSY;
}
 
calltime = trace_clock_local();
 
-   index = ++current->curr_ret_stack;
+   index = current->curr_ret_stack;
+   RET_STACK_INC(current->curr_ret_stack);
+   ret_stack = RET_STACK(current, index);
barrier();
-   current->ret_stack[index].ret = ret;
-   current->ret_stack[index].func = func;
-   current->ret_stack[index].calltime = calltime;
+   ret_stack->ret = ret;
+   ret_stack->func = func;
+   ret_stack->calltime = calltime;
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
-   current->ret_stack[index].fp = frame_pointer;
+   ret_stack->fp = frame_pointer;
 #endif
 #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
-   current->ret_stack[index].retp = retp;
+   ret_stack->retp = retp;
 #endif
return 0;
 }
@@ -137,7 +164,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
 
return 0;
  out_ret:
-   current->curr_ret_stack--;
+   RET_STACK_DEC(current->curr_ret_stack);
  out:
current->curr_ret_depth--;
return -EBUSY;
@@ -148,11 +175,13 @@ static void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
unsigned long frame_pointer)
 {
+   struct ftrace_ret_stack *ret_stack;
int index;
 
index = current->curr_ret_stack;
+   RET_STACK_DEC(index);
 
-   if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+   if (unlikely(index < 0 || index > SHADOW_STACK_MAX_INDEX)) {
ftrace_graph_stop();
WARN_ON(1);
/* Might as 

[PATCH 02/20] fgraph: Use BUILD_BUG_ON() to make sure we have structures divisible by long

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Instead of using "ALIGN()", use BUILD_BUG_ON() as the structures should
always be divisible by sizeof(long).

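A minimal stand-alone illustration of the same check, using C11 _Static_assert
in place of the kernel's BUILD_BUG_ON(); the trimmed structure is illustrative:

```
#include <stddef.h>

struct ret_stack {
	unsigned long ret;
	unsigned long func;
	unsigned long long calltime;
};

/* Fail the build if the frame is not a whole number of longs. */
_Static_assert(sizeof(struct ret_stack) % sizeof(long) == 0,
	       "ret_stack must be divisible by sizeof(long)");

int main(void) { return 0; }
```
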
Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509093949.162236.14518699447151894536.stgit@devnote2
Link: 
http://lkml.kernel.org/r/2019052444.gi2...@hirez.programming.kicks-ass.net

Suggested-by: Peter Zijlstra 
Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c62e6db718a0..fdb206aeffe3 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -33,7 +33,7 @@
  * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be 
added
  */
 #define FGRAPH_FRAME_SIZE  sizeof(struct ftrace_ret_stack)
-#define FGRAPH_FRAME_OFFSET(ALIGN(FGRAPH_FRAME_SIZE, sizeof(long)) / 
sizeof(long))
+#define FGRAPH_FRAME_OFFSETDIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
 #define SHADOW_STACK_SIZE (PAGE_SIZE)
 #define SHADOW_STACK_OFFSET\
(ALIGN(SHADOW_STACK_SIZE, sizeof(long)) / sizeof(long))
@@ -103,6 +103,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long 
func,
if (!current->ret_stack)
return -EBUSY;
 
+   BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
+
/*
 * We must make sure the ret_stack is tested before we read
 * anything else.
@@ -326,6 +328,8 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int 
idx)
 {
int index = task->curr_ret_stack;
 
+   BUILD_BUG_ON(FGRAPH_FRAME_SIZE % sizeof(long));
+
index -= FGRAPH_FRAME_OFFSET * (idx + 1);
if (index < 0)
return NULL;
-- 
2.43.0





[PATCH 00/20] function_graph: Allow multiple users for function graph tracing

2024-05-24 Thread Steven Rostedt
[
  Resend for some of you as I missed a comma in the Cc and
  quilt died sending this out.
]

This is a continuation of the function graph multi user code.
I wrote a proof of concept back in 2019 of this code[1] and
Masami started cleaning it up. I started from Masami's work v10
that can be found here:

 
https://lore.kernel.org/linux-trace-kernel/171509088006.162236.7227326999861366050.stgit@devnote2/

This is *only* the code that allows multiple users of function
graph tracing. This is not the fprobe work that Masami is working
to add on top of it. As Masami took my proof of concept, there
were still several things I disliked about that code. Instead of
having Masami clean it up even more, I decided to take over on just
my code and change it up a bit.

The biggest changes from where Masami left off is mostly renaming more
variables, macros, and function names. I fixed up the current comments
and added more to make the code a bit more understandable.

At the end of the series, I added two patches to optimize the entry
and exit. On entry, there was a loop that iterated the 16 elements
of the fgraph_array[] looking for any that may have a gops registered
to it. It's quite a waste to do that loop if there's only one
registered user. To fix that, I added an fgraph_array_bitmask whose set
bits correspond to the occupied elements of the array. Then
a simple for_each_set_bit() is used for the iteration. I do the same
thing at the exit callback of the function where it iterates over the
bits of the bitmap saved on the ret_stack.

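A stand-alone sketch of that optimization (illustrative, using a GCC/Clang
builtin in place of the kernel's for_each_set_bit()): only the set bits of the
registration bitmask are visited, instead of scanning all 16 array slots.

```
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long fgraph_array_bitmask = 0;
	int visited = 0;

	fgraph_array_bitmask |= 1UL << 0;	/* slot 0 registered */
	fgraph_array_bitmask |= 1UL << 9;	/* slot 9 registered */

	for (unsigned long mask = fgraph_array_bitmask; mask; mask &= mask - 1) {
		int bit = __builtin_ctzl(mask);	/* lowest set bit */

		printf("calling entryfunc of fgraph_array[%d]\n", bit);
		visited++;
	}
	assert(visited == 2);
	return 0;
}
```
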
I also noticed that Masami added code to handle tail calls in the
unwinder and had that in one of my patches. I took that code out
and made it a separate patch with Masami as the author.

The diff between this and Masami's last update is at the end of this email.

Based on Linus commit: 0eb03c7e8e2a4cc3653eb5eeb2d2001182071215

[1] https://lore.kernel.org/all/20190525031633.811342...@goodmis.org/

Masami Hiramatsu (Google) (3):
  function_graph: Handle tail calls for stack unwinding
  function_graph: Use a simple LRU for fgraph_array index number
  ftrace: Add multiple fgraph storage selftest

Steven Rostedt (Google) (2):
  function_graph: Use for_each_set_bit() in __ftrace_return_to_handler()
  function_graph: Use bitmask to loop on fgraph entry

Steven Rostedt (VMware) (15):
  function_graph: Convert ret_stack to a series of longs
  fgraph: Use BUILD_BUG_ON() to make sure we have structures divisible by 
long
  function_graph: Add an array structure that will allow multiple callbacks
  function_graph: Allow multiple users to attach to function graph
  function_graph: Remove logic around ftrace_graph_entry and return
  ftrace/function_graph: Pass fgraph_ops to function graph callbacks
  ftrace: Allow function_graph tracer to be enabled in instances
  ftrace: Allow ftrace startup flags to exist without dynamic ftrace
  function_graph: Have the instances use their own ftrace_ops for filtering
  function_graph: Add "task variables" per task for fgraph_ops
  function_graph: Move set_graph_function tests to shadow stack global var
  function_graph: Move graph depth stored data to shadow stack global var
  function_graph: Move graph notrace bit to shadow stack global var
  function_graph: Implement fgraph_reserve_data() and fgraph_retrieve_data()
  function_graph: Add selftest for passing local variables


 arch/arm64/kernel/ftrace.c   |  21 +-
 arch/loongarch/kernel/ftrace_dyn.c   |  15 +-
 arch/powerpc/kernel/trace/ftrace.c   |   3 +-
 arch/riscv/kernel/ftrace.c   |  15 +-
 arch/x86/kernel/ftrace.c |  19 +-
 include/linux/ftrace.h   |  42 +-
 include/linux/sched.h|   2 +-
 include/linux/trace_recursion.h  |  39 --
 kernel/trace/fgraph.c| 994 ---
 kernel/trace/ftrace.c|  11 +-
 kernel/trace/ftrace_internal.h   |   2 -
 kernel/trace/trace.h |  94 +++-
 kernel/trace/trace_functions.c   |   8 +
 kernel/trace/trace_functions_graph.c |  96 ++--
 kernel/trace/trace_irqsoff.c |  10 +-
 kernel/trace/trace_sched_wakeup.c|  10 +-
 kernel/trace/trace_selftest.c| 259 -
 17 files changed, 1330 insertions(+), 310 deletions(-)


diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 3313e4b83aa2..1aae521e5997 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -18,29 +18,36 @@
 #include "ftrace_internal.h"
 #include "trace.h"
 
-#define FGRAPH_FRAME_SIZE sizeof(struct ftrace_ret_stack)
-#define FGRAPH_FRAME_OFFSET DIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
 
 /*
- * On entry to a function (via function_graph_enter()), a new ftrace_ret_stack
- * is allocated on the task's ret_stack with bitmap entry, then each
- * fgraph_ops on the fgraph_array[]'s entryfunc is called and if that returns
- * non-zero, the index into the 





Re: [PATCH v10 07/36] function_graph: Allow multiple users to attach to function graph

2024-05-24 Thread Steven Rostedt
On Tue,  7 May 2024 23:09:22 +0900
"Masami Hiramatsu (Google)"  wrote:

> @@ -109,6 +244,21 @@ ftrace_push_return_trace(unsigned long ret, unsigned 
> long func,
>   if (!current->ret_stack)
>   return -EBUSY;
>  
> + /*
> +  * At first, check whether the previous fgraph callback is pushed by
> +  * the fgraph on the same function entry.
> +  * But if @func is the self tail-call function, we also need to ensure
> +  * the ret_stack is not for the previous call by checking whether the
> +  * bit of @fgraph_idx is set or not.
> +  */
> + ret_stack = get_ret_stack(current, current->curr_ret_stack, &offset);
> + if (ret_stack && ret_stack->func == func &&
> + get_fgraph_type(current, offset + FGRAPH_FRAME_OFFSET) == 
> FGRAPH_TYPE_BITMAP &&
> + !is_fgraph_index_set(current, offset + FGRAPH_FRAME_OFFSET, 
> fgraph_idx))
> + return offset + FGRAPH_FRAME_OFFSET;
> +
> + val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
> +
>   BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));

I'm trying to figure out what the above is trying to do. This gets called
once in function_graph_enter() (or function_graph_enter_ops()). What
exactly are you trying to catch here?

Is it from this email:

  
https://lore.kernel.org/all/20231110105154.df937bf9f200a0c16806c...@kernel.org/

As that's the last version before you added the above code.

But you also noticed it may not be needed, but triggered a crash without it
in v3:

  
https://lore.kernel.org/all/20231205234511.3839128259dfec153ea7d...@kernel.org/

I removed this code in my version and it runs just fine. Perhaps there was
another bug that this was hiding that you fixed in later versions?

-- Steve



Re: [PATCH v10 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-05-24 Thread Steven Rostedt
On Tue,  7 May 2024 23:08:00 +0900
"Masami Hiramatsu (Google)"  wrote:

> Steven Rostedt (VMware) (15):
>   function_graph: Convert ret_stack to a series of longs
>   fgraph: Use BUILD_BUG_ON() to make sure we have structures divisible by 
> long
>   function_graph: Add an array structure that will allow multiple 
> callbacks
>   function_graph: Allow multiple users to attach to function graph
>   function_graph: Remove logic around ftrace_graph_entry and return
>   ftrace/function_graph: Pass fgraph_ops to function graph callbacks
>   ftrace: Allow function_graph tracer to be enabled in instances
>   ftrace: Allow ftrace startup flags exist without dynamic ftrace
>   function_graph: Have the instances use their own ftrace_ops for 
> filtering
>   function_graph: Add "task variables" per task for fgraph_ops
>   function_graph: Move set_graph_function tests to shadow stack global var
>   function_graph: Move graph depth stored data to shadow stack global var
>   function_graph: Move graph notrace bit to shadow stack global var
>   function_graph: Implement fgraph_reserve_data() and 
> fgraph_retrieve_data()
>   function_graph: Add selftest for passing local variables

Hi Masami,

While reviewing these patches, I realized there's several things I dislike
about the patches I wrote. So I took these patches and started cleaning
them up a little. Mostly renaming functions and adding comments.

As this is a major change to the function graph tracer, and I feel nervous
about building something on top of this, how about I take over these
patches and push them out for the next merge window. I'm hoping to get them
into linux-next by v6.10-rc2 (I spent the day working on them, and it's
mostly minor tweaks).

Then I can push it out to 6.11 and get some good testing against it. Then
we can add your stuff on top and get that merged in 6.12.

If all goes well, I'm hoping to get a series on just these patches (and
your selftest addition) by tonight.

Thoughts?

-- Steve



Re: [PATCH RFC 1/2] dt-bindings: soc: qcom,smsm: Allow specifying mboxes instead of qcom,ipc

2024-05-24 Thread Luca Weiss
On Donnerstag, 23. Mai 2024 08:19:11 MESZ Krzysztof Kozlowski wrote:
> On 23/05/2024 08:16, Luca Weiss wrote:
> > On Donnerstag, 23. Mai 2024 08:02:13 MESZ Krzysztof Kozlowski wrote:
> >> On 22/05/2024 19:34, Luca Weiss wrote:
> >>> On Mittwoch, 22. Mai 2024 08:49:43 MESZ Krzysztof Kozlowski wrote:
>  On 21/05/2024 22:35, Luca Weiss wrote:
> > On Dienstag, 21. Mai 2024 10:58:07 MESZ Krzysztof Kozlowski wrote:
> >> On 20/05/2024 17:11, Luca Weiss wrote:
> >>> Hi Krzysztof
> >>>
> >>> Ack, sounds good.
> >>>
> >>> Maybe also from you, any opinion between these two binding styles?
> >>>
> >>> So first using index of mboxes for the numbering, where for the known
> >>> usages the first element (and sometimes the 3rd - ipc-2) are empty <>.
> >>>
> >>> The second variant is using mbox-names to get the correct channel-mbox
> >>> mapping.
> >>>
> >>> -   qcom,ipc-1 = < 8 13>;
> >>> -   qcom,ipc-2 = < 8 9>;
> >>> -   qcom,ipc-3 = < 8 19>;
> >>> +   mboxes = <0>, < 13>, < 9>, < 19>;
> >>>
> >>> vs.
> >>>
> >>> -   qcom,ipc-1 = < 8 13>;
> >>> -   qcom,ipc-2 = < 8 9>;
> >>> -   qcom,ipc-3 = < 8 19>;
> >>> +   mboxes = < 13>, < 9>, < 19>;
> >>> +   mbox-names = "ipc-1", "ipc-2", "ipc-3";
> >>
> >> Sorry, don't get, ipc-1 is the first mailbox, so why would there be <0>
> >> in first case?
> >
> > Actually not, ipc-0 would be permissible by the driver, used for the 
> > 0th host
> >
> > e.g. from:
> >
> > /* Iterate over all hosts to check whom wants a kick */
> > for (host = 0; host < smsm->num_hosts; host++) {
> > hostp = &smsm->hosts[host];
> >
> > Even though no mailbox is specified in any upstream dts for this 0th 
> > host I
> > didn't want the bindings to restrict that, that's why in the first 
> > example
> > there's an empty element (<0>) for the 0th smsm host
> >
> >> Anyway, the question is if you need to know that some
> >> mailbox is missing. But then it is weird to name them "ipc-1" etc.
> >
> > In either case we'd just query the mbox (either by name or index) and 
> > then
> > see if it's there? Not quite sure I understand the sentence..
> > Pretty sure either binding would work the same way.
> 
>  The question is: does the driver care only about having some mailboxes
>  or the driver cares about each specific mailbox? IOW, is skipping ipc-0
>  important for the driver?
> >>>
> >>> There's nothing special from driver side about any mailbox. Some SoCs have
> >>> a mailbox for e.g. hosts 1&2&3, some have only 1&3, and apq8064 even has
> >>> 1&2&3&4.
> >>>
> >>> And if the driver doesn't find a mailbox for a host, it just ignores it
> >>> but then of course it can't 'ring' the mailbox for that host when 
> >>> necessary.
> >>>
> >>> Not sure how much more I can add here, to be fair I barely understand what
> >>> this driver is doing myself apart from the obvious.
> >>
> >> From what you said, it looks like it is enough to just list mailboxes,
> >> e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):
> > 
> > No, for sure we need also the possibility to list ipc-3.
> 
> ? You can list it, what's the problem>

Maybe we're talking past each other...

You asked why this wouldn't work:

  e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):
  mboxes = < 13>, < 9>, < 19>;

How would we know that the 3rd mailbox ( 19) is for the 4th host
(previous ipc-4)?

1. If we use mboxes with indexes we'd need to have <0> values for
"smsm hosts" that we don't have a mailbox for - this is at least
the case for the 2nd smsm host (qcom,ipc-2) on a bunch of SoCs.

2. If we use mboxes with mbox-names then we could skip that since we
can directly specify which "smsm host" a given mailbox is for.

My only question really is whether 1. or 2. is a better idea.

Is this clearer now or still not?
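
For what it's worth, a rough driver-side sketch of the two options,
using the generic mailbox client API; the field and variable names are
illustrative, not the actual qcom_smsm.c code:

        /* 1. index-based lookup, with <0> placeholders for missing hosts */
        hostp->mbox_chan = mbox_request_channel(&hostp->mbox_client, host);

        /* 2. name-based lookup via mbox-names ("ipc-1", "ipc-2", ...) */
        snprintf(name, sizeof(name), "ipc-%d", host);
        hostp->mbox_chan = mbox_request_channel_byname(&hostp->mbox_client, name);

        /* either way, a missing mailbox just means "don't kick this host" */
        if (IS_ERR(hostp->mbox_chan))
                hostp->mbox_chan = NULL;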


> 
> > 
> > And my point is that I'm not sure if any platform will ever need ipc-0, but
> > the code to use that if it ever exists is there - the driver always
> > tries getting an mbox (currently just syscon of course) for every host
> > from 0 to n.
> > 
> > These are the current (non-mbox-API) mboxes provided to smsm:
> > 
> > $ git grep qcom,ipc- arch/
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-1 = < 
> > 8 4>;
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-2 = < 
> > 8 14>;
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-3 = < 
> > 8 23>;
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-4 = 
> > <_sic_non_secure 0x4094 0>;
> > arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-1 = < 
> > 8 13>;
> > arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-2 = < 
> > 8 9>;
> > 

Re: Bug in Kernel 6.8.x, 6.9.x Causing Trace/Panic During Shutdown/Reboot

2024-05-24 Thread Steven Rostedt
On Fri, 24 May 2024 12:50:08 +0200
"Linux regression tracking (Thorsten Leemhuis)"  
wrote:

> > - Affected Versions: Before kernel version 6.8.10, the bug caused a
> > quick display of a kernel trace dump before the shutdown/reboot
> > completed. Starting from version 6.8.10 and continuing into version
> > 6.9.0 and 6.9.1, this issue has escalated to a kernel panic,
> > preventing the shutdown or reboot from completing and leaving the
> > machine stuck.

Ah, I bet it was this commit: baa23a8d4360d ("tracefs: Reset permissions on
remount if permissions are options"), which added an "iput" callback to the
dentry without calling iput(), leaving stale inodes around.
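
For context, the VFS contract here: if a filesystem installs a dentry
->d_iput hook, it takes over dropping the inode and must call iput()
itself. A minimal sketch (illustrative name, not the actual tracefs
code):

        static void example_d_iput(struct dentry *dentry, struct inode *inode)
        {
                /* ... per-dentry cleanup ... */
                iput(inode);    /* skipping this leaks the inode */
        }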

This is fixed with:

  0bcfd9aa4dafa ("tracefs: Clear EVENT_INODE flag in tracefs_drop_inode()")

Try adding just that patch. It will at least make it go back to what was
happening before 6.8.10 (I hope!).

-- Steve



Re: Bug in Kernel 6.8.x, 6.9.x Causing Trace/Panic During Shutdown/Reboot

2024-05-24 Thread Steven Rostedt
On Fri, 24 May 2024 12:50:08 +0200
"Linux regression tracking (Thorsten Leemhuis)"  
wrote:

> [CCing a few people]
> 

Thanks for the Cc.

> On 24.05.24 12:31, Ilkka Naulapää wrote:
> > 
> > I have encountered a critical bug in the Linux vanilla kernel that
> > leads to a kernel panic during the shutdown or reboot process. The
> > issue arises after all services, including `journald`, have been
> > stopped. As a result, the machine fails to complete the shutdown or
> > reboot procedure, effectively causing the system to hang and not shut
> > down or reboot.  

To understand this, did you do anything with tracing? Before shutting down,
is there anything in /sys/kernel/tracing/instances directory?
Were any of the files/directories permissions in /sys/kernel/tracing changed?

> 
> Thx for the report. Not my area of expertise, so take this with a grain
> of salt. But given the versions you mention in your report and the
> screenshot that mentioned tracefs_free_inode I suspect this is caused by
> baa23a8d4360d ("tracefs: Reset permissions on remount if permissions are
> options"). A few fixes for it will soon hit mainline and are meant to be
> backported to affected stable trees:
> 
> https://lore.kernel.org/all/20240523212406.254317...@goodmis.org/
> https://lore.kernel.org/all/20240523174419.1e588...@gandalf.local.home/
> 
> You might want to try them – or recheck once they hit the stable trees
> you care about. If they don't work, please report back.

There's been quite a bit of updates in this code, but this looks new to me.
I have more fixes that were just pulled by Linus today.

  https://git.kernel.org/torvalds/c/0eb03c7e8e2a4cc3653eb5eeb2d2001182071215

I'm not sure how relevant that is for this. But if you can reproduce it
with that commit, then this is a new bug.

-- Steve



[PATCH v5 2/2] misc: fastrpc: use coherent pool for untranslated Compute Banks

2024-05-24 Thread Dylan Van Assche
Use fastrpc_remote_heap_alloc to allocate from the FastRPC device
instead of the Compute Bank when the session ID is 0. This ensures
that the allocation is inside the coherent DMA pool which is already
accessible to the DSP. This is necessary to support FastRPC devices
which do not have dedicated Compute Banks such as the SLPI on the SDM845.
The latter uses an allocated CMA region instead of FastRPC Compute Banks.

Signed-off-by: Dylan Van Assche 
Reviewed-by: Caleb Connolly 
---
 drivers/misc/fastrpc.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index c06667b29055..f53d20e2e07e 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -953,7 +953,10 @@ static int fastrpc_get_args(u32 kernel, struct 
fastrpc_invoke_ctx *ctx)
 
ctx->msg_sz = pkt_size;
 
-   err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, &ctx->buf);
+   if (ctx->fl->sctx->sid)
+   err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, &ctx->buf);
+   else
+   err = fastrpc_remote_heap_alloc(ctx->fl, dev, pkt_size, &ctx->buf);
if (err)
return err;
 
-- 
2.45.1




[PATCH v5 1/2] misc: fastrpc: support complete DMA pool access to the DSP

2024-05-24 Thread Dylan Van Assche
To support FastRPC Context Banks which aren't mapped via the SMMU,
make the whole reserved memory region available to the DSP to allow
access to coherent buffers.

This is performed by assigning the memory to the DSP via a hypervisor
call to set the correct permissions for the Virtual Machines on the DSP.
This is only necessary when a memory region is provided for SLPI DSPs
so guard this with a domain ID check.

Signed-off-by: Dylan Van Assche 
Reviewed-by: Caleb Connolly 
---
 drivers/misc/fastrpc.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index 4c67e2c5a82e..c06667b29055 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -2255,6 +2255,8 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev)
int i, err, domain_id = -1, vmcount;
const char *domain;
bool secure_dsp;
+   struct device_node *rmem_node;
+   struct reserved_mem *rmem;
unsigned int vmids[FASTRPC_MAX_VMIDS];
 
err = of_property_read_string(rdev->of_node, "label", &domain);
@@ -2297,6 +2299,23 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device 
*rpdev)
}
}
 
+   rmem_node = of_parse_phandle(rdev->of_node, "memory-region", 0);
+   if (domain_id == SDSP_DOMAIN_ID && rmem_node) {
+   u64 src_perms;
+
+   rmem = of_reserved_mem_lookup(rmem_node);
+   if (!rmem) {
+   err = -EINVAL;
+   goto fdev_error;
+   }
+
+   src_perms = BIT(QCOM_SCM_VMID_HLOS);
+
+   qcom_scm_assign_mem(rmem->base, rmem->size, &src_perms,
+   data->vmperms, data->vmcount);
+
+   }
+
secure_dsp = !(of_property_read_bool(rdev->of_node, 
"qcom,non-secure-domain"));
data->secure = secure_dsp;
 
-- 
2.45.1




[PATCH v5 0/2] misc: fastrpc: FastRPC reserved memory assignment for SDM845 SLPI

2024-05-24 Thread Dylan Van Assche
* About *

The Qualcomm SDM845 SoC has a separate SLPI (Sensor Low Power Island)
DSP for sensors connected to the SoC which is responsible for exposing
sensors to userspace, power saving, and other features. 
While sensors are connected to GPIOs of the SoC, they cannot be used
because the hypervisor blocks direct access to the sensors, thus the 
DSP must be used to access any sensor on this SoC. The SLPI DSP uses a
GLink edge (dsps) to communicate with the host and has a FastRPC interface
to load files from the host filesystem such as sensor configuration files.
The FastRPC interface does not use regular FastRPC Compute Banks
but instead uses an allocated CMA region through which communication happens.

* Changes *

This patch series adds support to the FastRPC driver for assigning a coherent
memory region to a DSP via the hypervisor with the correct permissions.
This is necessary to support the SLPI found in the Qualcomm SDM845 SoC, which
does not have dedicated FastRPC Compute Banks, in contrast to newer SoCs,
but uses a memory region instead when allocating buffers.

* Related patches *

1. Remoteproc changes to support the SLPI DSP in SDM845 (v3), needs to be 
applied:
https://lore.kernel.org/linux-remoteproc/20230330164633.117335-1...@dylanvanassche.be
2. DTS changes (v5), already applied:
https://lore.kernel.org/linux-devicetree/20230406173148.28309-1...@dylanvanassche.be

This series does not depend on any other series, but all of them are necessary
to enable the feature in the end.

* Changelog *

Changes in v5:
- Adjusted to FastRPC driver changes in 6.9.X.

Changes in v4:
- Fixed possible memory leak when driver encounters an error during probing.

Changes in v3:
- Dropped debug prints.
- Added Reviewed-By tags from v2.

Changes in v2:

- Removed double blank lines
- Dropped dt-bindings property as it is not needed for driver behavior
- Add additional patch to allocate buffers via CMA memory for DSPs
  without dedicated FastRPC Compute Banks.

Dylan Van Assche (2):
  misc: fastrpc: support complete DMA pool access to the DSP
  misc: fastrpc: use coherent pool for untranslated Compute Banks

 drivers/misc/fastrpc.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

-- 
2.45.1




Re: [PATCH v10 03/36] x86: tracing: Add ftrace_regs definition in the header

2024-05-24 Thread Steven Rostedt
On Fri, 24 May 2024 10:37:54 +0900
Masami Hiramatsu (Google)  wrote:
> > >  
> > >  #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
> > >  struct ftrace_regs {
> > > + /*
> > > +  * On the x86_64, the ftrace_regs saves;
> > > +  * rax, rcx, rdx, rdi, rsi, r8, r9, rbp, rip and rsp.
> > > +  * Also orig_ax is used for passing direct trampoline address.
> > > +  * x86_32 doesn't support ftrace_regs.  
> > 
> > Should add a comment that if fregs->regs.cs is set, then all of the pt_regs
> > is valid.  
> 
> But what about rbx and r1*? Should only regs->cs be checked for pt_regs?
> Or, did you mean "the ftrace_regs is valid"?

Yeah, on x86_64 ftrace_regs uses regs.cs to denote if it is valid or not:

static __always_inline struct pt_regs *
arch_ftrace_get_regs(struct ftrace_regs *fregs)
{
/* Only when FL_SAVE_REGS is set, cs will be non zero */
if (!fregs->regs.cs)
return NULL;
return &fregs->regs;
}


-- Steve



Re: How to properly fix reading user pointers in bpf in android kernel 4.9?

2024-05-24 Thread Bagas Sanjaya
[also Cc: bpf maintainers and get_maintainer output]

On Thu, May 23, 2024 at 07:52:22PM +0300, Marcel wrote:
> This seems that it was a long standing problem with the Linux kernel in 
> general. bpf_probe_read should have worked for both kernel and user pointers 
> but it fails with access error when reading an user one instead. 
> 
> I know there's a patch upstream that fixes this by introducing new helpers 
> for reading kernel and userspace pointers and I tried to back port them back 
> to my kernel but with no success. Tools like bcc fail to use them and instead 
> they report that the arguments sent to the helpers are invalid. I assume this 
> is due to the arguments ARG_CONST_STACK_SIZE and ARG_PTR_TO_RAW_STACK handle 
> data different in the 4.9 android version and the upstream version but I'm 
> not sure that this is the cause. I left the patch I did below and with a link 
> to the kernel I'm working on and maybe someone can take a look and give me an 
> hand (the patch isn't applied yet)

What upstream patch? Has it already been in mainline?

> 
> 
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 744b4763b80e..de94c13b7193 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -559,6 +559,43 @@ enum bpf_func_id {
> */
> BPF_FUNC_probe_read_user,
>  
> +   /**
> +   * int bpf_probe_read_kernel(void *dst, int size, void *src)
> +   * Read a kernel pointer safely.
> +   * Return: 0 on success or negative error
> +   */
> +   BPF_FUNC_probe_read_kernel,
> +
> + /**
> +  * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
> +  * Copy a NUL terminated string from user unsafe address. In case 
> the string
> +  * length is smaller than size, the target is not padded with 
> further NUL
> +  * bytes. In case the string length is larger than size, just 
> count-1
> +  * bytes are copied and the last byte is set to NUL.
> +  * @dst: destination address
> +  * @size: maximum number of bytes to copy, including the trailing 
> NUL
> +  * @unsafe_ptr: unsafe address
> +  * Return:
> +  *   > 0 length of the string including the trailing NUL on success
> +  *   < 0 error
> +  */
> + BPF_FUNC_probe_read_user_str,
> +
> + /**
> +  * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
> +  * Copy a NUL terminated string from unsafe address. In case the 
> string
> +  * length is smaller than size, the target is not padded with 
> further NUL
> +  * bytes. In case the string length is larger than size, just 
> count-1
> +  * bytes are copied and the last byte is set to NUL.
> +  * @dst: destination address
> +  * @size: maximum number of bytes to copy, including the trailing 
> NUL
> +  * @unsafe_ptr: unsafe address
> +  * Return:
> +  *   > 0 length of the string including the trailing NUL on success
> +  *   < 0 error
> +  */
> + BPF_FUNC_probe_read_kernel_str,
> +
>   __BPF_FUNC_MAX_ID,
>  };
>  
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index a1e37a5d8c88..3478ca744a45 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -94,7 +94,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
>   .arg3_type  = ARG_ANYTHING,
>  };
>  
> -BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, const void *, 
> unsafe_ptr)
> +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, const void  __user 
> *, unsafe_ptr)
>  {
>   int ret;
>  
> @@ -115,6 +115,27 @@ static const struct bpf_func_proto 
> bpf_probe_read_user_proto = {
>  };
>  
>  
> +BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, 
> unsafe_ptr)
> +{
> + int ret;
> +
> + ret = probe_kernel_read(dst, unsafe_ptr, size);
> + if (unlikely(ret < 0))
> + memset(dst, 0, size);
> +
> + return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
> + .func   = bpf_probe_read_kernel,
> + .gpl_only   = true,
> + .ret_type   = RET_INTEGER,
> + .arg1_type  = ARG_PTR_TO_RAW_STACK,
> + .arg2_type  = ARG_CONST_STACK_SIZE,
> + .arg3_type  = ARG_ANYTHING,
> +};
> +
> +
>  BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
>  u32, size)
>  {
> @@ -487,6 +508,69 @@ static const struct bpf_func_proto 
> bpf_probe_read_str_proto = {
>   .arg3_type  = ARG_ANYTHING,
>  };
>  
> +
> +
> +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
> +const void __user *, unsafe_ptr)
> +{
> + int ret;
> +
> + /*
> +  * The strncpy_from_unsafe() call will likely not fill the entire
> +  * buffer, but that's okay in this circumstance as we're probing
> +  * 

Re: Bug in Kernel 6.8.x, 6.9.x Causing Trace/Panic During Shutdown/Reboot

2024-05-24 Thread Linux regression tracking (Thorsten Leemhuis)
[CCing a few people]

On 24.05.24 12:31, Ilkka Naulapää wrote:
> 
> I have encountered a critical bug in the Linux vanilla kernel that
> leads to a kernel panic during the shutdown or reboot process. The
> issue arises after all services, including `journald`, have been
> stopped. As a result, the machine fails to complete the shutdown or
> reboot procedure, effectively causing the system to hang and not shut
> down or reboot.

Thx for the report. Not my area of expertise, so take this with a grain
of salt. But given the versions you mention in your report and the
screenshot that mentioned tracefs_free_inode I suspect this is caused by
baa23a8d4360d ("tracefs: Reset permissions on remount if permissions are
options"). A few fixes for it will soon hit mainline and are meant to be
backported to affected stable trees:

https://lore.kernel.org/all/20240523212406.254317...@goodmis.org/
https://lore.kernel.org/all/20240523174419.1e588...@gandalf.local.home/

You might want to try them – or recheck once they hit the stable trees
you care about. If they don't work, please report back.

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
If I did something stupid, please tell me, as explained on that page.

> Here are the details of the issue:
> 
> - Affected Versions: Before kernel version 6.8.10, the bug caused a
> quick display of a kernel trace dump before the shutdown/reboot
> completed. Starting from version 6.8.10 and continuing into version
> 6.9.0 and 6.9.1, this issue has escalated to a kernel panic,
> preventing the shutdown or reboot from completing and leaving the
> machine stuck.
> 
> - Symptoms:
>   - In normal shutdown/reboot scenarios, the kernel trace dump briefly
> appears as the last message on the screen.
>   - In rescue mode, the kernel panic message is displayed. Normally it
> is not shown.
> 
> Since `journald` is stopped before this issue occurs, no textual logs
> are available. However, I have captured two pictures illustrating
> these related issues, which I am attaching to this email for your
> reference. Also added my custom kernel config.
> 
> Thank you for your attention to this matter. Please let me know if any
> additional information is required to assist in diagnosing and
> resolving this bug.
> 
> Best regards,
> 
> Ilkka Naulapää



Re: [PATCH] livepatch: introduce klp_func called interface

2024-05-24 Thread zhang warden



> On May 23, 2024, at 22:22, Dan Carpenter  wrote:
> 
> Always run your patches through checkpatch.
> 
> So this patch is so that testers can see if a function has been called?
> Can you not get the same information from gcov or ftrace?
> 
> There are style issues with the patch, but it's not so important until
> the design is agreed on.
> 
> regards,
> dan carpenter

Hi, Dan.

This patch has format issues as Markus said. A newer version of this patch has
been sent, which was checked with ./scripts/checkpatch.pl

Thanks for your suggestions.

Regards,
Wardenjohn




Re: [PATCH] livepatch: introduce klp_func called interface

2024-05-24 Thread zhang warden



> On May 21, 2024, at 16:04, Petr Mladek  wrote:
> 
> Another motivation to use ftrace for testing is that it does not
> affect the performance in production.
> 
> We should keep klp_ftrace_handler() as fast as possible so that we
> could livepatch also performance sensitive functions.
> 

How about using unlikely() for the branch test? If we use unlikely(), maybe
there is no negative effect on klp_ftrace_handler() once this function is
called.
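
Something along these lines, assuming a 'called' flag on struct
klp_func as proposed in the patch (the flag name and placement are
assumptions, not current livepatch code):

        /* in klp_ftrace_handler(), before handing control to the new function */
        if (unlikely(!func->called))
                func->called = true;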

Regards,
Wardenjohn




Re: [RFC PATCH 00/20] Introduce the famfs shared-memory file system

2024-05-24 Thread Miklos Szeredi
On Fri, 24 May 2024 at 02:47, John Groves  wrote:

> Apologies, but I'm short on time at the moment - going into a long holiday
> weekend in the US with family plans. I should be focused again by middle of
> next week.

NP.

Obviously I'll need to test it before anything is merged, other than
that this is not urgent at all...

> But can you check /proc/cmdline to see if the memmap arg got through without
> getting mangled? The '$' tends to get fubar'd. You might need \$, or I've seen
> the need for \\\$. If it's un-mangled, there should be a dax device.

/proc/cmdline shows the option correctly:

root@kvm:~# cat /proc/cmdline
root=/dev/vda console=hvc0 memmap=4G$4G

> If that doesn't work, it's worth trying '!' instead, which I think would give
> you a pmem device - if the arg gets through (but ! is less likely to get
> horked). That pmem device can be converted to devdax...

That doesn't work either.  No device created in /dev  (dax or pmem).

free(1) does show that the reserved memory is gone in both cases, so
something does happen.

Attaching my .config as well.

Thanks,
Miklos


.config
Description: Binary data


[PATCH v4 2/2] LoongArch: Add steal time support in guest side

2024-05-24 Thread Bibo Mao
A percpu struct kvm_steal_time is added here; its size is 64 bytes and it
is also aligned to 64 bytes, so that the whole structure stays within one
physical page.

When a vcpu is onlined, function pv_enable_steal_time() is called. This
function passes the guest physical address of struct kvm_steal_time and
tells the hypervisor to enable steal time. When the vcpu goes offline, the
physical address is set to 0 to tell the hypervisor to disable steal time.
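
A rough sketch of that flow; the hypercall wrapper and the exact
argument layout are assumptions for illustration, not necessarily what
this patch implements:

        static int example_notify_steal_time(bool enable)
        {
                unsigned long gpa = 0;

                if (enable) {
                        gpa = per_cpu_ptr_to_phys(this_cpu_ptr(&steal_time));
                        gpa |= KVM_STEAL_PHYS_VALID;
                }

                /* gpa == 0 asks the hypervisor to stop steal time accounting */
                return kvm_hypercall1(KVM_HCALL_FUNC_NOTIFY, gpa);
        }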

Here is output of vmstat on guest when there is workload on both host
and guest. It shows steal time stat information.

procs ---memory-- -io -system-- --cpu-
 r  b   swpd   free  inact active   bibo   in   cs us sy id wa st
15  1  0 7583616 184112  72208200  162   52 31  6 43  0 20
17  0  0 7583616 184704  721920 0 6318 6885  5 60  8  5 22
16  0  0 7583616 185392  721440 0 1766 1081  0 49  0  1 50
16  0  0 7583616 184816  723040 0 6300 6166  4 62 12  2 20
18  0  0 7583632 184480  722400 0 2814 1754  2 58  4  1 35

Signed-off-by: Bibo Mao 
---
 .../admin-guide/kernel-parameters.txt |   2 +-
 arch/loongarch/Kconfig|  11 ++
 arch/loongarch/include/asm/paravirt.h |   5 +
 arch/loongarch/kernel/paravirt.c  | 133 ++
 arch/loongarch/kernel/time.c  |   2 +
 5 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index ef25e06ec08c..3435eaab392b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4057,7 +4057,7 @@
prediction) vulnerability. System may allow data
leaks with this option.
 
-   no-steal-acc[X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable
+   no-steal-acc[X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,LOONGARCH,EARLY] 
Disable
paravirtualized steal time accounting. steal time is
computed, but won't influence scheduler behaviour
 
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 4bda59525a13..a1c4b0080039 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -641,6 +641,17 @@ config PARAVIRT
  over full virtualization.  However, when run without a hypervisor
  the kernel is theoretically slower and slightly larger.
 
+config PARAVIRT_TIME_ACCOUNTING
+   bool "Paravirtual steal time accounting"
+   select PARAVIRT
+   help
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
 endmenu
 
 config ARCH_SELECT_MEMORY_MODEL
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
index 0965710f47f2..dddec49671ae 100644
--- a/arch/loongarch/include/asm/paravirt.h
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -18,6 +18,7 @@ static inline u64 paravirt_steal_clock(int cpu)
 }
 
 int __init pv_ipi_init(void);
+int __init pv_time_init(void);
 
 #else
 
@@ -26,5 +27,9 @@ static inline int pv_ipi_init(void)
return 0;
 }
 
+static inline int pv_time_init(void)
+{
+   return 0;
+}
 #endif // CONFIG_PARAVIRT
 #endif
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index 1633ed4f692f..68f3cd75862a 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -4,11 +4,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock;
 
 static u64 native_steal_clock(int cpu)
 {
@@ -17,6 +20,57 @@ static u64 native_steal_clock(int cpu)
 
 DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
 
+static bool steal_acc = true;
+static int __init parse_no_stealacc(char *arg)
+{
+   steal_acc = false;
+   return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static u64 para_steal_clock(int cpu)
+{
+   u64 steal;
+   struct kvm_steal_time *src;
+   int version;
+
+   src = &per_cpu(steal_time, cpu);
+   do {
+
+   version = src->version;
+   /* Make sure that the version is read before the steal */
+   virt_rmb();
+   steal = src->steal;
+   /* Make sure that the steal is read before the next version */
+   virt_rmb();
+
+   } while ((version & 1) || (version != src->version));
+   return steal;
+}
+
+static int pv_enable_steal_time(void)
+{
+   int cpu = smp_processor_id();
+   struct kvm_steal_time *st;
+   unsigned 

[PATCH v4 1/2] LoongArch: KVM: Add steal time support in kvm side

2024-05-24 Thread Bibo Mao
The steal time feature is added here on the KVM side. The VM can query the
features provided by the KVM hypervisor; feature KVM_FEATURE_STEAL_TIME is
added here. Like on x86, the steal time structure is kept in guest memory,
and one hypercall function, KVM_HCALL_FUNC_NOTIFY, is added to notify KVM
to enable the feature.

One vcpu attr ioctl command, KVM_LOONGARCH_VCPU_PVTIME_CTRL, is added to
save and restore the base address of the steal time structure when the VM
is migrated.
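
For illustration, a VMM could use that attribute around migration via
the generic vcpu device-attr ioctls; this is only a sketch (error
handling omitted), not part of this series:

        struct kvm_device_attr attr = {
                .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL,
                .attr  = KVM_LOONGARCH_VCPU_PVTIME_GPA,
                .addr  = (__u64)&gpa,
        };

        ioctl(vcpu_fd, KVM_GET_DEVICE_ATTR, &attr);     /* save on the source */
        ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);     /* restore on the target */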

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/kvm_host.h  |   7 ++
 arch/loongarch/include/asm/kvm_para.h  |  10 ++
 arch/loongarch/include/asm/kvm_vcpu.h  |   4 +
 arch/loongarch/include/asm/loongarch.h |   1 +
 arch/loongarch/include/uapi/asm/kvm.h  |   4 +
 arch/loongarch/kvm/Kconfig |   1 +
 arch/loongarch/kvm/exit.c  |  38 +++-
 arch/loongarch/kvm/vcpu.c  | 124 +
 8 files changed, 187 insertions(+), 2 deletions(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index c87b6ea0ec47..2eb2f7572023 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -30,6 +30,7 @@
 #define KVM_PRIVATE_MEM_SLOTS  0
 
 #define KVM_HALT_POLL_NS_DEFAULT   50
+#define KVM_REQ_STEAL_UPDATE   KVM_ARCH_REQ(1)
 
 #define KVM_GUESTDBG_SW_BP_MASK\
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)
@@ -201,6 +202,12 @@ struct kvm_vcpu_arch {
struct kvm_mp_state mp_state;
/* cpucfg */
u32 cpucfg[KVM_MAX_CPUCFG_REGS];
+   /* paravirt steal time */
+   struct {
+   u64 guest_addr;
+   u64 last_steal;
+   struct gfn_to_hva_cache cache;
+   } st;
 };
 
 static inline unsigned long readl_sw_gcsr(struct loongarch_csrs *csr, int reg)
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 4ba2312e5f8c..a9ba8185d4af 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -14,6 +14,7 @@
 
 #define KVM_HCALL_SERVICE  HYPERCALL_ENCODE(HYPERVISOR_KVM, 
KVM_HCALL_CODE_SERVICE)
#define  KVM_HCALL_FUNC_IPI	1
+#define  KVM_HCALL_FUNC_NOTIFY 2
 
 #define KVM_HCALL_SWDBG
HYPERCALL_ENCODE(HYPERVISOR_KVM, KVM_HCALL_CODE_SWDBG)
 
@@ -24,6 +25,15 @@
 #define KVM_HCALL_INVALID_CODE -1UL
 #define KVM_HCALL_INVALID_PARAMETER-2UL
 
+#define KVM_STEAL_PHYS_VALID   BIT_ULL(0)
#define KVM_STEAL_PHYS_MASK	GENMASK_ULL(63, 6)
+struct kvm_steal_time {
+   __u64 steal;
+   __u32 version;
+   __u32 flags;
+   __u32 pad[12];
+};
+
 /*
  * Hypercall interface for KVM hypervisor
  *
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 590a92cb5416..d7e51300a89f 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -120,4 +120,8 @@ static inline void kvm_write_reg(struct kvm_vcpu *vcpu, int 
num, unsigned long v
vcpu->arch.gprs[num] = val;
 }
 
+static inline bool kvm_pvtime_supported(void)
+{
+   return !!sched_info_on();
+}
 #endif /* __ASM_LOONGARCH_KVM_VCPU_H__ */
diff --git a/arch/loongarch/include/asm/loongarch.h 
b/arch/loongarch/include/asm/loongarch.h
index eb09adda54b7..7a4633ef284b 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -169,6 +169,7 @@
 #define  KVM_SIGNATURE "KVM\0"
 #define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
 #define  KVM_FEATURE_IPI   BIT(1)
#define  KVM_FEATURE_STEAL_TIME	BIT(2)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/loongarch/include/uapi/asm/kvm.h 
b/arch/loongarch/include/uapi/asm/kvm.h
index f9abef382317..ddc5cab0ffd0 100644
--- a/arch/loongarch/include/uapi/asm/kvm.h
+++ b/arch/loongarch/include/uapi/asm/kvm.h
@@ -81,7 +81,11 @@ struct kvm_fpu {
 #define LOONGARCH_REG_64(TYPE, REG)(TYPE | KVM_REG_SIZE_U64 | (REG << 
LOONGARCH_REG_SHIFT))
 #define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, 
REG)
 #define KVM_IOC_CPUCFG(REG)
LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG)
+
+/* Device Control API on vcpu fd */
 #define KVM_LOONGARCH_VCPU_CPUCFG  0
+#define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1
+#define  KVM_LOONGARCH_VCPU_PVTIME_GPA 0
 
 struct kvm_debug_exit_arch {
 };
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index c4ef2b4d9797..248744b4d086 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select KVM_MMIO
select HAVE_KVM_READONLY_MEM
select KVM_XFER_TO_GUEST_WORK
+   select SCHED_INFO
help
  Support hosting virtualized guest machines using
  hardware virtualization extensions. You will need
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c

[PATCH v4 0/2] LoongArch: Add steal time support

2024-05-24 Thread Bibo Mao
The para-virt steal time feature is added on both the kvm and guest kernel
sides. Similar to other architectures, the steal time structure comes from
guest memory, and a pseudo register is used to save/restore the base address
of the steal time structure, so that when the VM is migrated, the kvm module
on the target machine knows the base address of the steal time area.

---
v3 ... v4:
  1. To resolve compile dependency problem, enable SCHED_INFO option
section if KVM is enabled.
  2. Put new added option PARAVIRT_TIME_ACCOUNTING in one submemu with
PARAVIRT in file arch/loongarch/Kconfig.

v2 ... v3:
  1. Solve code confliction based on the kernel 6.9.0
  2. Add kernel parameter no-steal-acc support on LoongArch with file
Documentation/admin-guide/kernel-parameters.txt
  3. Add strict checking with pv stealtimer gpa address in function
kvm_save_notify() and kvm_loongarch_pvtime_set_attr()

v1 ... v2:
  1. Add PARAVIRT_TIME_ACCOUNTING kconfig option in file
arch/loongarch/Kconfig
  2. Function name change such as replace pv_register_steal_time with
pv_enable_steal_time etc

---

Bibo Mao (2):
  LoongArch: KVM: Add steal time support in kvm side
  LoongArch: Add steal time support in guest side

 .../admin-guide/kernel-parameters.txt |   2 +-
 arch/loongarch/Kconfig|  11 ++
 arch/loongarch/include/asm/kvm_host.h |   7 +
 arch/loongarch/include/asm/kvm_para.h |  10 ++
 arch/loongarch/include/asm/kvm_vcpu.h |   4 +
 arch/loongarch/include/asm/loongarch.h|   1 +
 arch/loongarch/include/asm/paravirt.h |   5 +
 arch/loongarch/include/uapi/asm/kvm.h |   4 +
 arch/loongarch/kernel/paravirt.c  | 133 ++
 arch/loongarch/kernel/time.c  |   2 +
 arch/loongarch/kvm/Kconfig|   1 +
 arch/loongarch/kvm/exit.c |  38 -
 arch/loongarch/kvm/vcpu.c | 124 
 13 files changed, 339 insertions(+), 3 deletions(-)


base-commit: 6e51b4b5bbc07e52b226017936874715629932d1
-- 
2.39.3




Re: [PATCH v10 03/36] x86: tracing: Add ftrace_regs definition in the header

2024-05-23 Thread Google
On Thu, 23 May 2024 19:14:59 -0400
Steven Rostedt  wrote:

> On Tue,  7 May 2024 23:08:35 +0900
> "Masami Hiramatsu (Google)"  wrote:
> 
> > From: Masami Hiramatsu (Google) 
> > 
> > Add ftrace_regs definition for x86_64 in the ftrace header to
> > clarify what register will be accessible from ftrace_regs.
> > 
> > Signed-off-by: Masami Hiramatsu (Google) 
> > ---
> >  Changes in v3:
> >   - Add rip to be saved.
> >  Changes in v2:
> >   - Newly added.
> > ---
> >  arch/x86/include/asm/ftrace.h |6 ++
> >  1 file changed, 6 insertions(+)
> > 
> > diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
> > index cf88cc8cc74d..c88bf47f46da 100644
> > --- a/arch/x86/include/asm/ftrace.h
> > +++ b/arch/x86/include/asm/ftrace.h
> > @@ -36,6 +36,12 @@ static inline unsigned long ftrace_call_adjust(unsigned 
> > long addr)
> >  
> >  #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
> >  struct ftrace_regs {
> > +   /*
> > +* On the x86_64, the ftrace_regs saves;
> > +* rax, rcx, rdx, rdi, rsi, r8, r9, rbp, rip and rsp.
> > +* Also orig_ax is used for passing direct trampoline address.
> > +* x86_32 doesn't support ftrace_regs.
> 
> Should add a comment that if fregs->regs.cs is set, then all of the pt_regs
> is valid.

But what about rbx and r1*? Should only regs->cs be checked for pt_regs?
Or, did you mean "the ftrace_regs is valid"?

> And x86_32 does support ftrace_regs, it just doesn't support
> having a subset of it.

Oh, thanks. I'll update the comment about x86_32.

Thank you,

> 
> -- Steve
> 
> 
> > +*/
> > struct pt_regs  regs;
> >  };
> >  
> 
> 


-- 
Masami Hiramatsu (Google) 



Re: [PATCH v10 01/36] tracing: Add a comment about ftrace_regs definition

2024-05-23 Thread Google
On Thu, 23 May 2024 19:10:31 -0400
Steven Rostedt  wrote:

> On Tue,  7 May 2024 23:08:12 +0900
> "Masami Hiramatsu (Google)"  wrote:
> 
> > From: Masami Hiramatsu (Google) 
> > 
> > To clarify what will be expected on ftrace_regs, add a comment to the
> > architecture independent definition of the ftrace_regs.
> > 
> > Signed-off-by: Masami Hiramatsu (Google) 
> > Acked-by: Mark Rutland 
> > ---
> >  Changes in v8:
> >   - Update that the saved registers depends on the context.
> >  Changes in v3:
> >   - Add instruction pointer
> >  Changes in v2:
> >   - newly added.
> > ---
> >  include/linux/ftrace.h |   26 ++
> >  1 file changed, 26 insertions(+)
> > 
> > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> > index 54d53f345d14..b81f1afa82a1 100644
> > --- a/include/linux/ftrace.h
> > +++ b/include/linux/ftrace.h
> > @@ -118,6 +118,32 @@ extern int ftrace_enabled;
> >  
> >  #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
> >  
> > +/**
> > + * ftrace_regs - ftrace partial/optimal register set
> > + *
> > + * ftrace_regs represents a group of registers which is used at the
> > + * function entry and exit. There are three types of registers.
> > + *
> > + * - Registers for passing the parameters to callee, including the stack
> > + *   pointer. (e.g. rcx, rdx, rdi, rsi, r8, r9 and rsp on x86_64)
> > + * - Registers for passing the return values to caller.
> > + *   (e.g. rax and rdx on x86_64)
> > + * - Registers for hooking the function call and return including the
> > + *   frame pointer (the frame pointer is architecture/config dependent)
> > + *   (e.g. rip, rbp and rsp for x86_64)
> > + *
> > + * Also, architecture dependent fields can be used for internal process.
> > + * (e.g. orig_ax on x86_64)
> > + *
> > + * On the function entry, those registers will be restored except for
> > + * the stack pointer, so that user can change the function parameters
> > + * and instruction pointer (e.g. live patching.)
> > + * On the function exit, only registers which is used for return values
> > + * are restored.
> 
> I wonder if we should also add a note that some architectures in some
> circumstances may store all pt_regs in ftrace_regs. For example, if an
> architecture supports FTRACE_WITH_REGS, it may pass the pt_regs within the
> ftrace_regs. If that is the case, then ftrace_get_regs() called on it will
> return a pointer to a valid pt_regs, or NULL if it is not supported or the
> ftrace_regs does not have all the registers.

Agreed. That case should also be noted. Thanks for pointing that out!


> 
> -- Steve
> 
> 
> > + *
> > + * NOTE: user *must not* access regs directly, only do it via APIs, because
> > + * the member can be changed according to the architecture.
> > + */
> >  struct ftrace_regs {
> > struct pt_regs  regs;
> >  };
> 


-- 
Masami Hiramatsu (Google) 



Re: [RFC PATCH 00/20] Introduce the famfs shared-memory file system

2024-05-23 Thread John Groves
On 24/05/23 03:57PM, Miklos Szeredi wrote:
> [trimming CC list]
> 
> On Thu, 23 May 2024 at 04:49, John Groves  wrote:
> 
> > - memmap=! will reserve a pretend pmem device at 
> > 
> > - memmap=$ will reserve a pretend dax device at 
> > 
> 
> Doesn't get me a /dev/dax or /dev/pmem
> 
> Complete qemu command line:
> 
> qemu-kvm -s -serial none -parallel none -kernel
> /home/mszeredi/git/linux/arch/x86/boot/bzImage -drive
> format=raw,file=/home/mszeredi/root_fs,index=0,if=virtio -drive
> format=raw,file=/home/mszeredi/images/ubd1,index=1,if=virtio -chardev
> stdio,id=virtiocon0,signal=off -device virtio-serial -device
> virtconsole,chardev=virtiocon0 -cpu host -m 8G -net user -net
> nic,model=virtio -fsdev local,security_model=none,id=fsdev0,path=/home
> -device virtio-9p-pci,fsdev=fsdev0,mount_tag=hostshare -device
> virtio-rng-pci -smp 4 -append 'root=/dev/vda console=hvc0
> memmap=4G$4G'
> 
> root@kvm:~/famfs# scripts/chk_efi.sh
> This system is neither Ubuntu nor Fedora. It is identified as debian.
> /sys/firmware/efi not found; probably not efi
>  not found; probably nof efi
> /boot/efi/EFI not found; probably not efi
> /boot/efi/EFI/BOOT not found; probably not efi
> /boot/efi/EFI/ not found; probably not efi
> /boot/efi/EFI//grub.cfg not found; probably nof efi
> Probably not efi; errs=6
> 
> Thanks,
> Miklos


Apologies, but I'm short on time at the moment - going into a long holiday
weekend in the US with family plans. I should be focused again by middle of
next week.

But can you check /proc/cmdline to see if the memmap arg got through without
getting mangled? The '$' tends to get fubar'd. You might need \$, or I've seen
the need for \\\$. If it's un-mangled, there should be a dax device.

If that doesn't work, it's worth trying '!' instead, which I think would give
you a pmem device - if the arg gets through (but ! is less likely to get
horked). That pmem device can be converted to devdax...

Regards,
John




Re: [PATCH] uprobes: prevent mutex_lock() under rcu_read_lock()

2024-05-23 Thread Google
On Mon, 20 May 2024 22:30:17 -0700
Andrii Nakryiko  wrote:

> Recent changes made uprobe_cpu_buffer preparation lazy, and moved it
> deeper into __uprobe_trace_func(). This is problematic because
> __uprobe_trace_func() is called inside rcu_read_lock()/rcu_read_unlock()
> block, which then calls prepare_uprobe_buffer() -> uprobe_buffer_get() ->
> mutex_lock(&ucb->mutex), leading to a splat about using mutex under
> non-sleepable RCU:
> 
>   BUG: sleeping function called from invalid context at 
> kernel/locking/mutex.c:585
>in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 98231, name: 
> stress-ng-sigq
>preempt_count: 0, expected: 0
>RCU nest depth: 1, expected: 0
>...
>Call Trace:
> 
> dump_stack_lvl+0x3d/0xe0
> __might_resched+0x24c/0x270
> ? prepare_uprobe_buffer+0xd5/0x1d0
> __mutex_lock+0x41/0x820
> ? ___perf_sw_event+0x206/0x290
> ? __perf_event_task_sched_in+0x54/0x660
> ? __perf_event_task_sched_in+0x54/0x660
> prepare_uprobe_buffer+0xd5/0x1d0
> __uprobe_trace_func+0x4a/0x140
> uprobe_dispatcher+0x135/0x280
> ? uprobe_dispatcher+0x94/0x280
> uprobe_notify_resume+0x650/0xec0
> ? atomic_notifier_call_chain+0x21/0x110
> ? atomic_notifier_call_chain+0xf8/0x110
> irqentry_exit_to_user_mode+0xe2/0x1e0
> asm_exc_int3+0x35/0x40
>RIP: 0033:0x7f7e1d4da390
>Code: 33 04 00 0f 1f 80 00 00 00 00 f3 0f 1e fa b9 01 00 00 00 e9 b2 fc ff 
> ff 66 90 f3 0f 1e fa 31 c9 e9 a5 fc ff ff 0f 1f 44 00 00  0f 1e fa b8 27 
> 00 00 00 0f 05 c3 0f 1f 40 00 f3 0f 1e fa b8 6e
>RSP: 002b:7ffd2abc3608 EFLAGS: 0246
>RAX:  RBX: 76d325f1 RCX: 
>RDX: 76d325f1 RSI: 000a RDI: 7ffd2abc3690
>RBP: 000a R08: 00017fb7 R09: 00017fb7
>R10: 00017fb7 R11: 0246 R12: 00017ff2
>R13: 7ffd2abc3610 R14:  R15: 7ffd2abc3780
> 
> 
> Luckily, it's easy to fix by moving prepare_uprobe_buffer() to be called
> slightly earlier: into uprobe_trace_func() and uretprobe_trace_func(), outside
> of RCU locked section. This still keeps this buffer preparation lazy and helps
> avoid the overhead when it's not needed. E.g., if there is only BPF uprobe
> handler installed on a given uprobe, buffer won't be initialized.
> 
> Note, the other user of prepare_uprobe_buffer(), __uprobe_perf_func(), is not
> affected, as it doesn't prepare buffer under RCU read lock.
> 

Oops, good catch! This looks good to me. Let me pick it.
Let me add a simple uprobe test in ftracetest so that this error can be
detected in selftests. (I could reproduce it.)

Thank you,

> Fixes: 1b8f85defbc8 ("uprobes: prepare uprobe args buffer lazily")
> Reported-by: Breno Leitao 
> Signed-off-by: Andrii Nakryiko 
> ---
>  kernel/trace/trace_uprobe.c | 14 +-
>  1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 8541fa1494ae..c98e3b3386ba 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -970,19 +970,17 @@ static struct uprobe_cpu_buffer 
> *prepare_uprobe_buffer(struct trace_uprobe *tu,
>  
>  static void __uprobe_trace_func(struct trace_uprobe *tu,
>   unsigned long func, struct pt_regs *regs,
> - struct uprobe_cpu_buffer **ucbp,
> + struct uprobe_cpu_buffer *ucb,
>   struct trace_event_file *trace_file)
>  {
>   struct uprobe_trace_entry_head *entry;
>   struct trace_event_buffer fbuffer;
> - struct uprobe_cpu_buffer *ucb;
>   void *data;
>   int size, esize;
>   struct trace_event_call *call = trace_probe_event_call(&tu->tp);
>  
>   WARN_ON(call != trace_file->event_call);
>  
> - ucb = prepare_uprobe_buffer(tu, regs, ucbp);
>   if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE))
>   return;
>  
> @@ -1014,13 +1012,16 @@ static int uprobe_trace_func(struct trace_uprobe *tu, 
> struct pt_regs *regs,
>struct uprobe_cpu_buffer **ucbp)
>  {
>   struct event_file_link *link;
> + struct uprobe_cpu_buffer *ucb;
>  
>   if (is_ret_probe(tu))
>   return 0;
>  
> + ucb = prepare_uprobe_buffer(tu, regs, ucbp);
> +
>   rcu_read_lock();
>   trace_probe_for_each_link_rcu(link, &tu->tp)
> - __uprobe_trace_func(tu, 0, regs, ucbp, link->file);
> + __uprobe_trace_func(tu, 0, regs, ucb, link->file);
>   rcu_read_unlock();
>  
>   return 0;
> @@ -1031,10 +1032,13 @@ static void uretprobe_trace_func(struct trace_uprobe 
> *tu, unsigned long func,
>struct uprobe_cpu_buffer **ucbp)
>  {
>   struct event_file_link *link;
> + struct uprobe_cpu_buffer *ucb;
> +
> + ucb = prepare_uprobe_buffer(tu, regs, ucbp);
>  
>   rcu_read_lock();
>   

Re: [PATCH v10 03/36] x86: tracing: Add ftrace_regs definition in the header

2024-05-23 Thread Steven Rostedt
On Tue,  7 May 2024 23:08:35 +0900
"Masami Hiramatsu (Google)"  wrote:

> From: Masami Hiramatsu (Google) 
> 
> Add ftrace_regs definition for x86_64 in the ftrace header to
> clarify what register will be accessible from ftrace_regs.
> 
> Signed-off-by: Masami Hiramatsu (Google) 
> ---
>  Changes in v3:
>   - Add rip to be saved.
>  Changes in v2:
>   - Newly added.
> ---
>  arch/x86/include/asm/ftrace.h |6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
> index cf88cc8cc74d..c88bf47f46da 100644
> --- a/arch/x86/include/asm/ftrace.h
> +++ b/arch/x86/include/asm/ftrace.h
> @@ -36,6 +36,12 @@ static inline unsigned long ftrace_call_adjust(unsigned 
> long addr)
>  
>  #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
>  struct ftrace_regs {
> + /*
> +  * On the x86_64, the ftrace_regs saves;
> +  * rax, rcx, rdx, rdi, rsi, r8, r9, rbp, rip and rsp.
> +  * Also orig_ax is used for passing direct trampoline address.
> +  * x86_32 doesn't support ftrace_regs.

Should add a comment that if fregs->regs.cs is set, then all of the pt_regs
is valid. And x86_32 does support ftrace_regs, it just doesn't support
having a subset of it.

-- Steve


> +  */
>   struct pt_regs  regs;
>  };
>  




Re: [PATCH v10 01/36] tracing: Add a comment about ftrace_regs definition

2024-05-23 Thread Steven Rostedt
On Tue,  7 May 2024 23:08:12 +0900
"Masami Hiramatsu (Google)"  wrote:

> From: Masami Hiramatsu (Google) 
> 
> To clarify what will be expected on ftrace_regs, add a comment to the
> architecture independent definition of the ftrace_regs.
> 
> Signed-off-by: Masami Hiramatsu (Google) 
> Acked-by: Mark Rutland 
> ---
>  Changes in v8:
>   - Update that the saved registers depends on the context.
>  Changes in v3:
>   - Add instruction pointer
>  Changes in v2:
>   - newly added.
> ---
>  include/linux/ftrace.h |   26 ++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 54d53f345d14..b81f1afa82a1 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -118,6 +118,32 @@ extern int ftrace_enabled;
>  
>  #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
>  
> +/**
> + * ftrace_regs - ftrace partial/optimal register set
> + *
> + * ftrace_regs represents a group of registers which is used at the
> + * function entry and exit. There are three types of registers.
> + *
> + * - Registers for passing the parameters to callee, including the stack
> + *   pointer. (e.g. rcx, rdx, rdi, rsi, r8, r9 and rsp on x86_64)
> + * - Registers for passing the return values to caller.
> + *   (e.g. rax and rdx on x86_64)
> + * - Registers for hooking the function call and return including the
> + *   frame pointer (the frame pointer is architecture/config dependent)
> + *   (e.g. rip, rbp and rsp for x86_64)
> + *
> + * Also, architecture dependent fields can be used for internal process.
> + * (e.g. orig_ax on x86_64)
> + *
> + * On the function entry, those registers will be restored except for
> + * the stack pointer, so that user can change the function parameters
> + * and instruction pointer (e.g. live patching.)
> + * On the function exit, only registers which is used for return values
> + * are restored.

I wonder if we should also add a note that some architectures in some
circumstances may store all pt_regs in ftrace_regs. For example, if an
architecture supports FTRACE_WITH_REGS, it may pass the pt_regs within the
ftrace_regs. If that is the case, then ftrace_get_regs() called on it will
return a pointer to a valid pt_regs, or NULL if it is not supported or the
ftrace_regs does not have all the registers.

-- Steve


> + *
> + * NOTE: user *must not* access regs directly, only do it via APIs, because
> + * the member can be changed according to the architecture.
> + */
>  struct ftrace_regs {
>   struct pt_regs  regs;
>  };




Re: [PATCH v2 1/1] x86/vector: Fix vector leak during CPU offline

2024-05-23 Thread Thomas Gleixner
On Wed, May 22 2024 at 15:02, Dongli Zhang wrote:
> The absence of IRQD_MOVE_PCNTXT prevents immediate effectiveness of
> interrupt affinity reconfiguration via procfs. Instead, the change is
> deferred until the next instance of the interrupt being triggered on the
> original CPU.
>
> When the interrupt next triggers on the original CPU, the new affinity is
> enforced within __irq_move_irq(). A vector is allocated from the new CPU,
> but if the old vector on the original CPU remains online, it is not
> immediately reclaimed. Instead, apicd->move_in_progress is flagged, and the
> reclaiming process is delayed until the next trigger of the interrupt on
> the new CPU.
>
> Upon the subsequent triggering of the interrupt on the new CPU,
> irq_complete_move() adds a task to the old CPU's vector_cleanup list if it
> remains online. Subsequently, the timer on the old CPU iterates over its
> vector_cleanup list, reclaiming old vectors.
>
> However, a rare scenario arises if the old CPU is outgoing before the
> interrupt triggers again on the new CPU. The irq_force_complete_move() may
> not have the chance to be invoked on the outgoing CPU to reclaim the old
> apicd->prev_vector. This is because the interrupt isn't currently affine to
> the outgoing CPU, and irq_needs_fixup() returns false. Even though
> __vector_schedule_cleanup() is later called on the new CPU, it doesn't
> reclaim apicd->prev_vector; instead, it simply resets both
> apicd->move_in_progress and apicd->prev_vector to 0.
>
> As a result, the vector remains unreclaimed in vector_matrix, leading to a
> CPU vector leak.
>
> To address this issue, move the invocation of irq_force_complete_move()
> before the irq_needs_fixup() call to reclaim apicd->prev_vector, if the
> interrupt is currently or used to affine to the outgoing CPU. Additionally,
> reclaim the vector in __vector_schedule_cleanup() as well, following a
> warning message, although theoretically it should never see
> apicd->move_in_progress with apicd->prev_cpu pointing to an offline CPU.

Nice change log!



Re: [GIT PULL v2] virtio: features, fixes, cleanups

2024-05-23 Thread pr-tracker-bot
The pull request you sent on Thu, 23 May 2024 02:00:17 -0400:

> https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/2ef32ad2241340565c35baf77fc95053c84eeeb0

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html



Re: [PATCH] x86/paravirt: Disable virt spinlock when CONFIG_PARAVIRT_SPINLOCKS disabled

2024-05-23 Thread Dave Hansen
On 5/23/24 11:39, Jürgen Groß wrote:
>>
>> Let's just keep it simple.  How about the attached patch?
> 
> Simple indeed. The attachment is empty. 

Let's try this again.diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..c193c9e60a1b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -55,8 +55,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
 
 void __init native_pv_lock_init(void)
 {
-	if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
-	!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
 		static_branch_disable(&virt_spin_lock_key);
 }
 


Re: [PATCH] x86/paravirt: Disable virt spinlock when CONFIG_PARAVIRT_SPINLOCKS disabled

2024-05-23 Thread Jürgen Groß

On 23.05.24 18:30, Dave Hansen wrote:

On 5/16/24 06:02, Chen Yu wrote:

Performance drop is reported when running encode/decode workload and
BenchSEE cache sub-workload.
Bisect points to commit ce0a1b608bfc ("x86/paravirt: Silence unused
native_pv_lock_init() function warning"). When CONFIG_PARAVIRT_SPINLOCKS
is disabled the virt_spin_lock_key is set to true on bare-metal.
The qspinlock degenerates to test-and-set spinlock, which decrease the
performance on bare-metal.

Fix this by disabling virt_spin_lock_key if CONFIG_PARAVIRT_SPINLOCKS
is not set, or it is on bare-metal.


This is missing some background:

The kernel can change spinlock behavior when running as a guest.  But
this guest-friendly behavior causes performance problems on bare metal.
So there's a 'virt_spin_lock_key' static key to switch between the two
modes.

The static key is always enabled by default (run in guest mode) and
should be disabled for bare metal (and in some guests that want native
behavior).

... then describe the regression and the fix


diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..ee51c0949ed8 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -55,7 +55,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
  
  void __init native_pv_lock_init(void)

  {
-   if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
+   if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) ||
!boot_cpu_has(X86_FEATURE_HYPERVISOR))
static_branch_disable(&virt_spin_lock_key);
  }

This gets used at a single site:

 if (pv_enabled())
 goto pv_queue;

 if (virt_spin_lock(lock))
 return;

which is logically:

if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS))
goto ...; // don't look at virt_spin_lock_key

if (virt_spin_lock_key)
return; // On virt, but non-paravirt.  Did Test-and-Set
// spinlock.

So I _think_ Arnd was trying to optimize native_pv_lock_init() away when
it's going to get skipped over anyway by the 'goto'.

But this took me at least 30 minutes of scratching my head and trying to
untangle the whole thing.  It's all far too subtle for my taste, and all
of that to save a few bytes of init text in a configuration that's
probably not even used very often (PARAVIRT=y, but PARAVIRT_SPINLOCKS=n).

Let's just keep it simple.  How about the attached patch?


Simple indeed. The attachment is empty. :-p


Juergen



Re: [PATCH] riscv: Fix early ftrace nop patching

2024-05-23 Thread patchwork-bot+linux-riscv
Hello:

This patch was applied to riscv/linux.git (for-next)
by Palmer Dabbelt :

On Thu, 23 May 2024 13:51:34 +0200 you wrote:
> Commit c97bf629963e ("riscv: Fix text patching when IPI are used")
> converted ftrace_make_nop() to use patch_insn_write() which does not
> emit any icache flush relying entirely on __ftrace_modify_code() to do
> that.
> 
> But we missed that ftrace_make_nop() was called very early directly when
> converting mcount calls into nops (actually on riscv it converts 2B nops
> emitted by the compiler into 4B nops).
> 
> [...]

Here is the summary with links:
  - riscv: Fix early ftrace nop patching
https://git.kernel.org/riscv/c/6ca445d8af0e

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html





Re: [PATCH v2 1/2] drivers: remoteproc: xlnx: add attach detach support

2024-05-23 Thread Tanmay Shah



On 5/23/24 12:05 PM, Mathieu Poirier wrote:
> On Wed, May 22, 2024 at 09:36:26AM -0500, Tanmay Shah wrote:
>> 
>> 
>> On 5/21/24 12:56 PM, Mathieu Poirier wrote:
>> > Hi Tanmay,
>> > 
>> > On Fri, May 10, 2024 at 05:51:25PM -0700, Tanmay Shah wrote:
>> >> It is possible that remote processor is already running before
>> >> linux boot or remoteproc platform driver probe. Implement required
>> >> remoteproc framework ops to provide resource table address and
>> >> connect or disconnect with remote processor in such case.
>> >> 
>> >> Signed-off-by: Tanmay Shah 
>> >> ---
>> >> 
>> >> Changes in v2:
>> >>   - Fix following sparse warnings
>> >> 
>> >> drivers/remoteproc/xlnx_r5_remoteproc.c:827:21: sparse:expected 
>> >> struct rsc_tbl_data *rsc_data_va
>> >> drivers/remoteproc/xlnx_r5_remoteproc.c:844:18: sparse:expected 
>> >> struct resource_table *rsc_addr
>> >> drivers/remoteproc/xlnx_r5_remoteproc.c:898:24: sparse:expected void 
>> >> volatile [noderef] __iomem *addr
>> >> 
>> >>  drivers/remoteproc/xlnx_r5_remoteproc.c | 164 +++-
>> >>  1 file changed, 160 insertions(+), 4 deletions(-)
>> >> 
>> >> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
>> >> b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> >> index 84243d1dff9f..039370cffa32 100644
>> >> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
>> >> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> >> @@ -25,6 +25,10 @@
>> >>  /* RX mailbox client buffer max length */
>> >>  #define MBOX_CLIENT_BUF_MAX  (IPI_BUF_LEN_MAX + \
>> >>sizeof(struct zynqmp_ipi_message))
>> >> +
>> >> +#define RSC_TBL_XLNX_MAGIC   ((uint32_t)'x' << 24 | (uint32_t)'a' << 
>> >> 16 | \
>> >> +  (uint32_t)'m' << 8 | (uint32_t)'p')
>> >> +
>> >>  /*
>> >>   * settings for RPU cluster mode which
>> >>   * reflects possible values of xlnx,cluster-mode dt-property
>> >> @@ -73,6 +77,15 @@ struct mbox_info {
>> >>   struct mbox_chan *rx_chan;
>> >>  };
>> >>  
>> >> +/* Xilinx Platform specific data structure */
>> >> +struct rsc_tbl_data {
>> >> + const int version;
>> >> + const u32 magic_num;
>> >> + const u32 comp_magic_num;
>> > 
>> > Why is a complement magic number needed?
>> 
>> Actually the magic number is 64-bit. There is a good chance that the
>> firmware can have a 32-bit op-code or data equal to the magic number, but
>> much less chance of its complement at the next address. So we can assume
>> the magic number is effectively 64-bit.
>>
> 
> So why not have a magic number that is a u64?
> 
>> > 
>> >> + const u32 rsc_tbl_size;
>> >> + const uintptr_t rsc_tbl;
>> >> +} __packed;
>> >> +
>> >>  /*
>> >>   * Hardcoded TCM bank values. This will stay in driver to maintain 
>> >> backward
>> >>   * compatibility with device-tree that does not have TCM information.
>> >> @@ -95,20 +108,24 @@ static const struct mem_bank_data 
>> >> zynqmp_tcm_banks_lockstep[] = {
>> >>  /**
>> >>   * struct zynqmp_r5_core
>> >>   *
>> >> + * @rsc_tbl_va: resource table virtual address
>> >>   * @dev: device of RPU instance
>> >>   * @np: device node of RPU instance
>> >>   * @tcm_bank_count: number TCM banks accessible to this RPU
>> >>   * @tcm_banks: array of each TCM bank data
>> >>   * @rproc: rproc handle
>> >> + * @rsc_tbl_size: resource table size retrieved from remote
>> >>   * @pm_domain_id: RPU CPU power domain id
>> >>   * @ipi: pointer to mailbox information
>> >>   */
>> >>  struct zynqmp_r5_core {
>> >> + struct resource_table *rsc_tbl_va;
>> > 
>> > Shouldn't this be of type "void __iomem *"?  Did sparse give you trouble 
>> > on that
>> > one?
>> 
>> I fixed sparse warnings with typecast below [1].
>> 
> 
>> > My point is, ioremap_wc() returns a "void __iomem *" so why not use that
>> > instead of a "struct resource_table *"?

Ack.

> 
> 
>> > 
>> >>   struct device *dev;
>> >>   struct device_node *np;
>> >>   int tcm_bank_count;
>> >>   struct mem_bank_data **tcm_banks;
>> >>   struct rproc *rproc;
>> >> + u32 rsc_tbl_size;
>> >>   u32 pm_domain_id;
>> >>   struct mbox_info *ipi;
>> >>  };
>> >> @@ -621,10 +638,19 @@ static int zynqmp_r5_rproc_prepare(struct rproc 
>> >> *rproc)
>> >>  {
>> >>   int ret;
>> >>  
>> >> - ret = add_tcm_banks(rproc);
>> >> - if (ret) {
>> >> - dev_err(&rproc->dev, "failed to get TCM banks, err %d\n", ret);
>> >> - return ret;
>> >> + /**
>> > 
>> > Using "/**" is for comments that will endup in the documentation, which I 
>> > don't
>> > think is needed here.  Please correct throughout the patch.
>> 
>> Thanks. Ack, I will use only /* format.
>> 
>> > 
>> >> +  * For attach/detach use case, Firmware is already loaded so
>> >> +  * TCM isn't really needed at all. Also, for security TCM can be
>> >> +  * locked in such case and linux may not have access at all.
>> >> +  * So avoid adding TCM banks. TCM power-domains requested during attach
>> >> +  * callback.
>> >> +  */
>> >> + if (rproc->state != RPROC_DETACHED) {
>> >> + ret = add_tcm_banks(rproc);
>> >> + if 

Re: [PATCH v2 1/2] drivers: remoteproc: xlnx: add attach detach support

2024-05-23 Thread Mathieu Poirier
On Wed, May 22, 2024 at 09:36:26AM -0500, Tanmay Shah wrote:
> 
> 
> On 5/21/24 12:56 PM, Mathieu Poirier wrote:
> > Hi Tanmay,
> > 
> > On Fri, May 10, 2024 at 05:51:25PM -0700, Tanmay Shah wrote:
> >> It is possible that remote processor is already running before
> >> linux boot or remoteproc platform driver probe. Implement required
> >> remoteproc framework ops to provide resource table address and
> >> connect or disconnect with remote processor in such case.
> >> 
> >> Signed-off-by: Tanmay Shah 
> >> ---
> >> 
> >> Changes in v2:
> >>   - Fix following sparse warnings
> >> 
> >> drivers/remoteproc/xlnx_r5_remoteproc.c:827:21: sparse:expected struct 
> >> rsc_tbl_data *rsc_data_va
> >> drivers/remoteproc/xlnx_r5_remoteproc.c:844:18: sparse:expected struct 
> >> resource_table *rsc_addr
> >> drivers/remoteproc/xlnx_r5_remoteproc.c:898:24: sparse:expected void 
> >> volatile [noderef] __iomem *addr
> >> 
> >>  drivers/remoteproc/xlnx_r5_remoteproc.c | 164 +++-
> >>  1 file changed, 160 insertions(+), 4 deletions(-)
> >> 
> >> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
> >> b/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> index 84243d1dff9f..039370cffa32 100644
> >> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> @@ -25,6 +25,10 @@
> >>  /* RX mailbox client buffer max length */
> >>  #define MBOX_CLIENT_BUF_MAX   (IPI_BUF_LEN_MAX + \
> >> sizeof(struct zynqmp_ipi_message))
> >> +
> >> +#define RSC_TBL_XLNX_MAGIC((uint32_t)'x' << 24 | (uint32_t)'a' << 
> >> 16 | \
> >> +   (uint32_t)'m' << 8 | (uint32_t)'p')
> >> +
> >>  /*
> >>   * settings for RPU cluster mode which
> >>   * reflects possible values of xlnx,cluster-mode dt-property
> >> @@ -73,6 +77,15 @@ struct mbox_info {
> >>struct mbox_chan *rx_chan;
> >>  };
> >>  
> >> +/* Xilinx Platform specific data structure */
> >> +struct rsc_tbl_data {
> >> +  const int version;
> >> +  const u32 magic_num;
> >> +  const u32 comp_magic_num;
> > 
> > Why is a complement magic number needed?
> 
> Actually the magic number is 64-bit. There is a good chance that the
> firmware can have a 32-bit op-code or data equal to the magic number, but
> much less chance of its complement at the next address. So we can assume
> the magic number is effectively 64-bit.
>

So why not have a magic number that is a u64?
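
That would also keep the sanity check to a single compare, e.g. (sketch only,
the macro name is assumed):

	if (rsc_data_va->magic_num != RSC_TBL_XLNX_MAGIC64)
		return -EINVAL;

rather than testing magic_num and comp_magic_num separately.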

> > 
> >> +  const u32 rsc_tbl_size;
> >> +  const uintptr_t rsc_tbl;
> >> +} __packed;
> >> +
> >>  /*
> >>   * Hardcoded TCM bank values. This will stay in driver to maintain 
> >> backward
> >>   * compatibility with device-tree that does not have TCM information.
> >> @@ -95,20 +108,24 @@ static const struct mem_bank_data 
> >> zynqmp_tcm_banks_lockstep[] = {
> >>  /**
> >>   * struct zynqmp_r5_core
> >>   *
> >> + * @rsc_tbl_va: resource table virtual address
> >>   * @dev: device of RPU instance
> >>   * @np: device node of RPU instance
> >>   * @tcm_bank_count: number TCM banks accessible to this RPU
> >>   * @tcm_banks: array of each TCM bank data
> >>   * @rproc: rproc handle
> >> + * @rsc_tbl_size: resource table size retrieved from remote
> >>   * @pm_domain_id: RPU CPU power domain id
> >>   * @ipi: pointer to mailbox information
> >>   */
> >>  struct zynqmp_r5_core {
> >> +  struct resource_table *rsc_tbl_va;
> > 
> > Shouldn't this be of type "void __iomem *"?  Did sparse give you trouble on 
> > that
> > one?
> 
> I fixed sparse warnings with typecast below [1].
> 

My point is, ioremap_wc() returns a "void __iomem *" so why not use that
instead of a "struct resource_table *"?


> > 
> >>struct device *dev;
> >>struct device_node *np;
> >>int tcm_bank_count;
> >>struct mem_bank_data **tcm_banks;
> >>struct rproc *rproc;
> >> +  u32 rsc_tbl_size;
> >>u32 pm_domain_id;
> >>struct mbox_info *ipi;
> >>  };
> >> @@ -621,10 +638,19 @@ static int zynqmp_r5_rproc_prepare(struct rproc 
> >> *rproc)
> >>  {
> >>int ret;
> >>  
> >> -  ret = add_tcm_banks(rproc);
> >> -  if (ret) {
> >> -  dev_err(&rproc->dev, "failed to get TCM banks, err %d\n", ret);
> >> -  return ret;
> >> +  /**
> > 
> > Using "/**" is for comments that will endup in the documentation, which I 
> > don't
> > think is needed here.  Please correct throughout the patch.
> 
> Thanks. Ack, I will use only /* format.
> 
> > 
> >> +   * For attach/detach use case, Firmware is already loaded so
> >> +   * TCM isn't really needed at all. Also, for security TCM can be
> >> +   * locked in such case and linux may not have access at all.
> >> +   * So avoid adding TCM banks. TCM power-domains requested during attach
> >> +   * callback.
> >> +   */
> >> +  if (rproc->state != RPROC_DETACHED) {
> >> +  ret = add_tcm_banks(rproc);
> >> +  if (ret) {
> >> +  dev_err(&rproc->dev, "failed to get TCM banks, err 
> >> %d\n", ret);
> >> +  return ret;
> >> +  }
> >>}
> >>  
> 

[PATCH] ipvs: Avoid unnecessary calls to skb_is_gso_sctp

2024-05-23 Thread Ismael Luceno
In the context of the SCTP SNAT/DNAT handler, these calls can only
return true.

Ref: e10d3ba4d434 ("ipvs: Fix checksumming on GSO of SCTP packets")
Signed-off-by: Ismael Luceno 
CC: Pablo Neira Ayuso 
CC: Michal Kubeček 
CC: Simon Horman 
CC: Julian Anastasov 
CC: lvs-de...@vger.kernel.org
CC: netfilter-de...@vger.kernel.org
CC: net...@vger.kernel.org
CC: coret...@netfilter.org
---
 net/netfilter/ipvs/ip_vs_proto_sctp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c 
b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1e689c714127..83e452916403 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -126,7 +126,7 @@ sctp_snat_handler(struct sk_buff *skb, struct 
ip_vs_protocol *pp,
if (sctph->source != cp->vport || payload_csum ||
skb->ip_summed == CHECKSUM_PARTIAL) {
sctph->source = cp->vport;
-   if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb))
+   if (!skb_is_gso(skb))
sctp_nat_csum(skb, sctph, sctphoff);
} else {
skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -175,7 +175,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct 
ip_vs_protocol *pp,
(skb->ip_summed == CHECKSUM_PARTIAL &&
 !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) {
sctph->dest = cp->dport;
-   if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb))
+   if (!skb_is_gso(skb))
sctp_nat_csum(skb, sctph, sctphoff);
} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
-- 
2.44.0




Re: [PATCH] x86/paravirt: Disable virt spinlock when CONFIG_PARAVIRT_SPINLOCKS disabled

2024-05-23 Thread Dave Hansen
On 5/16/24 06:02, Chen Yu wrote:
> Performance drop is reported when running encode/decode workload and
> BenchSEE cache sub-workload.
> Bisect points to commit ce0a1b608bfc ("x86/paravirt: Silence unused
> native_pv_lock_init() function warning"). When CONFIG_PARAVIRT_SPINLOCKS
> is disabled the virt_spin_lock_key is set to true on bare-metal.
> The qspinlock degenerates to test-and-set spinlock, which decrease the
> performance on bare-metal.
> 
> Fix this by disabling virt_spin_lock_key if CONFIG_PARAVIRT_SPINLOCKS
> is not set, or it is on bare-metal.

This is missing some background:

The kernel can change spinlock behavior when running as a guest.  But
this guest-friendly behavior causes performance problems on bare metal.
So there's a 'virt_spin_lock_key' static key to switch between the two
modes.

The static key is always enabled by default (run in guest mode) and
should be disabled for bare metal (and in some guests that want native
behavior).

... then describe the regression and the fix

> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 5358d43886ad..ee51c0949ed8 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -55,7 +55,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
>  
>  void __init native_pv_lock_init(void)
>  {
> - if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
> + if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) ||
>   !boot_cpu_has(X86_FEATURE_HYPERVISOR))
>   static_branch_disable(&virt_spin_lock_key);
>  }
This gets used at a single site:

if (pv_enabled())
goto pv_queue;

if (virt_spin_lock(lock))
return;

which is logically:

if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS))
goto ...; // don't look at virt_spin_lock_key

if (virt_spin_lock_key)
return; // On virt, but non-paravirt.  Did Test-and-Set
// spinlock.

So I _think_ Arnd was trying to optimize native_pv_lock_init() away when
it's going to get skipped over anyway by the 'goto'.

But this took me at least 30 minutes of scratching my head and trying to
untangle the whole thing.  It's all far too subtle for my taste, and all
of that to save a few bytes of init text in a configuration that's
probably not even used very often (PARAVIRT=y, but PARAVIRT_SPINLOCKS=n).

Let's just keep it simple.  How about the attached patch?

Re: [PATCH v2] sched/rt: Clean up usage of rt_task()

2024-05-23 Thread Steven Rostedt
On Wed, 15 May 2024 23:05:36 +0100
Qais Yousef  wrote:
> diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
> index df3aca89d4f5..5cb88b748ad6 100644
> --- a/include/linux/sched/deadline.h
> +++ b/include/linux/sched/deadline.h
> @@ -10,8 +10,6 @@
>  
>  #include 
>  
> -#define MAX_DL_PRIO  0
> -
>  static inline int dl_prio(int prio)
>  {
>   if (unlikely(prio < MAX_DL_PRIO))
> @@ -19,6 +17,10 @@ static inline int dl_prio(int prio)
>   return 0;
>  }
>  
> +/*
> + * Returns true if a task has a priority that belongs to DL class. PI-boosted
> + * tasks will return true. Use dl_policy() to ignore PI-boosted tasks.
> + */
>  static inline int dl_task(struct task_struct *p)
>  {
>   return dl_prio(p->prio);
> diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
> index ab83d85e1183..6ab43b4f72f9 100644
> --- a/include/linux/sched/prio.h
> +++ b/include/linux/sched/prio.h
> @@ -14,6 +14,7 @@
>   */
>  
>  #define MAX_RT_PRIO  100
> +#define MAX_DL_PRIO  0
>  
>  #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
>  #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
> index b2b9e6eb9683..a055dd68a77c 100644
> --- a/include/linux/sched/rt.h
> +++ b/include/linux/sched/rt.h
> @@ -7,18 +7,43 @@
>  struct task_struct;
>  
>  static inline int rt_prio(int prio)
> +{
> + if (unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO))
> + return 1;
> + return 0;
> +}
> +
> +static inline int realtime_prio(int prio)
>  {
>   if (unlikely(prio < MAX_RT_PRIO))
>   return 1;
>   return 0;
>  }

I'm thinking we should change the above to bool (separate patch), as
returning an int may give one the impression that it returns the actual
priority number. Having it return bool will clear that up.

In fact, if we are touching these functions, might as well change all of
them to bool when returning true/false. Just to make it easier to
understand what they are doing.
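
That is, something like this (untested):

	static inline bool rt_prio(int prio)
	{
		return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
	}

	static inline bool rt_task(struct task_struct *p)
	{
		return rt_prio(p->prio);
	}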

>  
> +/*
> + * Returns true if a task has a priority that belongs to RT class. PI-boosted
> + * tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
> + */
>  static inline int rt_task(struct task_struct *p)
>  {
>   return rt_prio(p->prio);
>  }
>  
> -static inline bool task_is_realtime(struct task_struct *tsk)
> +/*
> + * Returns true if a task has a priority that belongs to RT or DL classes.
> + * PI-boosted tasks will return true. Use realtime_task_policy() to ignore
> + * PI-boosted tasks.
> + */
> +static inline int realtime_task(struct task_struct *p)
> +{
> + return realtime_prio(p->prio);
> +}
> +
> +/*
> + * Returns true if a task has a policy that belongs to RT or DL classes.
> + * PI-boosted tasks will return false.
> + */
> +static inline bool realtime_task_policy(struct task_struct *tsk)
>  {
>   int policy = tsk->policy;
>  



> diff --git a/kernel/trace/trace_sched_wakeup.c 
> b/kernel/trace/trace_sched_wakeup.c
> index 0469a04a355f..19d737742e29 100644
> --- a/kernel/trace/trace_sched_wakeup.c
> +++ b/kernel/trace/trace_sched_wakeup.c
> @@ -545,7 +545,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
>*  - wakeup_dl handles tasks belonging to sched_dl class only.
>*/
>   if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
> - (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
> + (wakeup_rt && !realtime_task(p)) ||
>   (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= 
> current->prio)))
>   return;
>  

Reviewed-by: Steven Rostedt (Google) 





Re: [RFC PATCH 0/5] vsock/virtio: Add support for multi-devices

2024-05-23 Thread Michael S. Tsirkin
On Fri, May 17, 2024 at 10:46:02PM +0800, Xuewei Niu wrote:
>  include/linux/virtio_vsock.h|   2 +-
>  include/net/af_vsock.h  |  25 ++-
>  include/uapi/linux/virtio_vsock.h   |   1 +
>  include/uapi/linux/vm_sockets.h |  14 ++
>  net/vmw_vsock/af_vsock.c| 116 +--
>  net/vmw_vsock/virtio_transport.c| 255 ++--
>  net/vmw_vsock/virtio_transport_common.c |  16 +-
>  net/vmw_vsock/vsock_loopback.c  |   4 +-
>  8 files changed, 352 insertions(+), 81 deletions(-)

As any change to virtio device/driver interface, this has to
go through the virtio TC. Please subscribe at
virtio-comment+subscr...@lists.linux.dev and then
contact the TC at virtio-comm...@lists.linux.dev

You will likely eventually need to write a spec draft document, too.

-- 
MST




Re: [PATCH] livepatch: introduce klp_func called interface

2024-05-23 Thread Dan Carpenter
On Sun, May 19, 2024 at 03:43:43PM +0800, Wardenjohn wrote:
> Livepatch modules are usually used to modify kernel functions.
> If the patched function has a bug, it may cause serious results
> such as a kernel crash.
> 
> This commit introduces a read-only livepatch sysfs interface.
> If a livepatch function is called, the "called" sysfs attribute
> of the patched function is set to 1.
> 
> /sys/kernel/livepatchcalled
> 
> The "called" value is useful for assuring the stability of a livepatching
> module on a running system. Testing is important before a livepatch module
> is applied to a production system. With this interface, a test run can
> easily find out which patched functions were actually called, and make
> sure it has covered all of the functions changed by the patch.
> ---

Always run your patches through checkpatch.

So this patch is so that testers can see if a function has been called?
Can you not get the same information from gcov or ftrace?

There are style issues with the patch, but it's not so important until
the design is agreed on.

regards,
dan carpenter




Re: [PATCH] riscv: Fix early ftrace nop patching

2024-05-23 Thread Conor Dooley
On Thu, May 23, 2024 at 01:51:34PM +0200, Alexandre Ghiti wrote:
> Commit c97bf629963e ("riscv: Fix text patching when IPI are used")
> converted ftrace_make_nop() to use patch_insn_write() which does not
> emit any icache flush relying entirely on __ftrace_modify_code() to do
> that.
> 
> But we missed that ftrace_make_nop() was called very early directly when
> converting mcount calls into nops (actually on riscv it converts 2B nops
> emitted by the compiler into 4B nops).
> 
> This caused crashes on multiple HW as reported by Conor and Björn since
> the booting core could have half-patched instructions in its icache
> which would trigger an illegal instruction trap: fix this by emitting a
> local flush icache when early patching nops.
> 
> Fixes: c97bf629963e ("riscv: Fix text patching when IPI are used")
> Signed-off-by: Alexandre Ghiti 

Reported-by: Conor Dooley 
Tested-by: Conor Dooley 

Thanks for the quick fix Alex :)


signature.asc
Description: PGP signature


Re: [RFC PATCH 00/20] Introduce the famfs shared-memory file system

2024-05-23 Thread Miklos Szeredi
[trimming CC list]

On Thu, 23 May 2024 at 04:49, John Groves  wrote:

> - memmap=! will reserve a pretend pmem device at 
> 
> - memmap=$ will reserve a pretend dax device at 

Doesn't get me a /dev/dax or /dev/pmem

Complete qemu command line:

qemu-kvm -s -serial none -parallel none -kernel
/home/mszeredi/git/linux/arch/x86/boot/bzImage -drive
format=raw,file=/home/mszeredi/root_fs,index=0,if=virtio -drive
format=raw,file=/home/mszeredi/images/ubd1,index=1,if=virtio -chardev
stdio,id=virtiocon0,signal=off -device virtio-serial -device
virtconsole,chardev=virtiocon0 -cpu host -m 8G -net user -net
nic,model=virtio -fsdev local,security_model=none,id=fsdev0,path=/home
-device virtio-9p-pci,fsdev=fsdev0,mount_tag=hostshare -device
virtio-rng-pci -smp 4 -append 'root=/dev/vda console=hvc0
memmap=4G$4G'

root@kvm:~/famfs# scripts/chk_efi.sh
This system is neither Ubuntu nor Fedora. It is identified as debian.
/sys/firmware/efi not found; probably not efi
 not found; probably nof efi
/boot/efi/EFI not found; probably not efi
/boot/efi/EFI/BOOT not found; probably not efi
/boot/efi/EFI/ not found; probably not efi
/boot/efi/EFI//grub.cfg not found; probably nof efi
Probably not efi; errs=6

Thanks,
Miklos



Re: [PATCH] riscv: Fix early ftrace nop patching

2024-05-23 Thread Björn Töpel
Alexandre Ghiti  writes:

> Commit c97bf629963e ("riscv: Fix text patching when IPI are used")
> converted ftrace_make_nop() to use patch_insn_write() which does not
> emit any icache flush relying entirely on __ftrace_modify_code() to do
> that.
>
> But we missed that ftrace_make_nop() was called very early directly when
> converting mcount calls into nops (actually on riscv it converts 2B nops
> emitted by the compiler into 4B nops).
>
> This caused crashes on multiple HW as reported by Conor and Björn since
> the booting core could have half-patched instructions in its icache
> which would trigger an illegal instruction trap: fix this by emitting a
> local flush icache when early patching nops.
>
> Fixes: c97bf629963e ("riscv: Fix text patching when IPI are used")
> Signed-off-by: Alexandre Ghiti 

Nice!

I've managed to reproduce the crash on the VisionFive2 board (however
only triggered when CONFIG_RELOCATABLE=y), and can verify that this fix
solves the issue.

Reviewed-by: Björn Töpel 
Tested-by: Björn Töpel 




[PATCHv7 9/9] man2: Add uretprobe syscall page

2024-05-23 Thread Jiri Olsa
Adding man page for new uretprobe syscall.

Reviewed-by: Alejandro Colomar 
Signed-off-by: Jiri Olsa 
---
 man/man2/uretprobe.2 | 56 
 1 file changed, 56 insertions(+)
 create mode 100644 man/man2/uretprobe.2

diff --git a/man/man2/uretprobe.2 b/man/man2/uretprobe.2
new file mode 100644
index ..cf1c2b0d852e
--- /dev/null
+++ b/man/man2/uretprobe.2
@@ -0,0 +1,56 @@
+.\" Copyright (C) 2024, Jiri Olsa 
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH uretprobe 2 (date) "Linux man-pages (unreleased)"
+.SH NAME
+uretprobe \- execute pending return uprobes
+.SH SYNOPSIS
+.nf
+.B int uretprobe(void)
+.fi
+.SH DESCRIPTION
+The
+.BR uretprobe ()
+system call is an alternative to breakpoint instructions for triggering return
+uprobe consumers.
+.P
+Calls to
+.BR uretprobe ()
+system call are only made from the user-space trampoline provided by the 
kernel.
+Calls from any other place result in a
+.BR SIGILL .
+.SH RETURN VALUE
+The
+.BR uretprobe ()
+system call return value is architecture-specific.
+.SH ERRORS
+.TP
+.B SIGILL
+The
+.BR uretprobe ()
+system call was called by a user-space program.
+.SH VERSIONS
+Details of the
+.BR uretprobe ()
+system call behavior vary across systems.
+.SH STANDARDS
+None.
+.SH HISTORY
+TBD
+.SH NOTES
+The
+.BR uretprobe ()
+system call was initially introduced for the x86_64 architecture
+where it was shown to be faster than breakpoint traps.
+It might be extended to other architectures.
+.P
+The
+.BR uretprobe ()
+system call exists only to allow the invocation of return uprobe consumers.
+It should
+.B never
+be called directly.
+Details of the arguments (if any) passed to
+.BR uretprobe ()
+and the return value are architecture-specific.
-- 
2.45.1




[PATCHv7 bpf-next 8/9] selftests/bpf: Add uretprobe shadow stack test

2024-05-23 Thread Jiri Olsa
Adding uretprobe shadow stack test that runs all existing
uretprobe tests with shadow stack enabled if it's available.

Signed-off-by: Jiri Olsa 
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 60 +++
 1 file changed, 60 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 3ef324c2db50..fda456401284 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -9,6 +9,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include "uprobe_syscall.skel.h"
 #include "uprobe_syscall_executed.skel.h"
 
@@ -297,6 +300,56 @@ static void test_uretprobe_syscall_call(void)
close(go[1]);
close(go[0]);
 }
+
+/*
+ * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c.
+ *
+ * For use in inline enablement of shadow stack.
+ *
+ * The program can't return from the point where shadow stack gets enabled
+ * because there will be no address on the shadow stack. So it can't use
+ * syscall() for enablement, since it is a function.
+ *
+ * Based on code from nolibc.h. Keep a copy here because this can't pull
+ * in all of nolibc.h.
+ */
+#define ARCH_PRCTL(arg1, arg2) \
+({ \
+   long _ret;  \
+   register long _num  asm("eax") = __NR_arch_prctl;   \
+   register long _arg1 asm("rdi") = (long)(arg1);  \
+   register long _arg2 asm("rsi") = (long)(arg2);  \
+   \
+   asm volatile (  \
+   "syscall\n" \
+   : "=a"(_ret)\
+   : "r"(_arg1), "r"(_arg2),   \
+ "0"(_num) \
+   : "rcx", "r11", "memory", "cc"  \
+   );  \
+   _ret;   \
+})
+
+#ifndef ARCH_SHSTK_ENABLE
+#define ARCH_SHSTK_ENABLE  0x5001
+#define ARCH_SHSTK_DISABLE 0x5002
+#define ARCH_SHSTK_SHSTK   (1ULL <<  0)
+#endif
+
+static void test_uretprobe_shadow_stack(void)
+{
+   if (ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK)) {
+   test__skip();
+   return;
+   }
+
+   /* Run all of the uretprobe tests. */
+   test_uretprobe_regs_equal();
+   test_uretprobe_regs_change();
+   test_uretprobe_syscall_call();
+
+   ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
+}
 #else
 static void test_uretprobe_regs_equal(void)
 {
@@ -312,6 +365,11 @@ static void test_uretprobe_syscall_call(void)
 {
test__skip();
 }
+
+static void test_uretprobe_shadow_stack(void)
+{
+   test__skip();
+}
 #endif
 
 void test_uprobe_syscall(void)
@@ -322,4 +380,6 @@ void test_uprobe_syscall(void)
test_uretprobe_regs_change();
if (test__start_subtest("uretprobe_syscall_call"))
test_uretprobe_syscall_call();
+   if (test__start_subtest("uretprobe_shadow_stack"))
+   test_uretprobe_shadow_stack();
 }
-- 
2.45.1




[PATCHv7 bpf-next 7/9] selftests/bpf: Add uretprobe syscall call from user space test

2024-05-23 Thread Jiri Olsa
Adding test to verify that when called from outside of the
trampoline provided by kernel, the uretprobe syscall will cause
calling process to receive SIGILL signal and the attached bpf
program is not executed.

Acked-by: Andrii Nakryiko 
Reviewed-by: Masami Hiramatsu (Google) 
Signed-off-by: Jiri Olsa 
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 95 +++
 .../bpf/progs/uprobe_syscall_executed.c   | 17 
 2 files changed, 112 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 1a50cd35205d..3ef324c2db50 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -7,7 +7,10 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "uprobe_syscall.skel.h"
+#include "uprobe_syscall_executed.skel.h"
 
 __naked unsigned long uretprobe_regs_trigger(void)
 {
@@ -209,6 +212,91 @@ static void test_uretprobe_regs_change(void)
}
 }
 
+#ifndef __NR_uretprobe
+#define __NR_uretprobe 462
+#endif
+
+__naked unsigned long uretprobe_syscall_call_1(void)
+{
+   /*
+* Pretend we are uretprobe trampoline to trigger the return
+* probe invocation in order to verify we get SIGILL.
+*/
+   asm volatile (
+   "pushq %rax\n"
+   "pushq %rcx\n"
+   "pushq %r11\n"
+   "movq $" __stringify(__NR_uretprobe) ", %rax\n"
+   "syscall\n"
+   "popq %r11\n"
+   "popq %rcx\n"
+   "retq\n"
+   );
+}
+
+__naked unsigned long uretprobe_syscall_call(void)
+{
+   asm volatile (
+   "call uretprobe_syscall_call_1\n"
+   "retq\n"
+   );
+}
+
+static void test_uretprobe_syscall_call(void)
+{
+   LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
+   .retprobe = true,
+   );
+   struct uprobe_syscall_executed *skel;
+   int pid, status, err, go[2], c;
+
+   if (ASSERT_OK(pipe(go), "pipe"))
+   return;
+
+   skel = uprobe_syscall_executed__open_and_load();
+   if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
+   goto cleanup;
+
+   pid = fork();
+   if (!ASSERT_GE(pid, 0, "fork"))
+   goto cleanup;
+
+   /* child */
+   if (pid == 0) {
+   close(go[1]);
+
+   /* wait for parent's kick */
+   err = read(go[0], &c, 1);
+   if (err != 1)
+   exit(-1);
+
+   uretprobe_syscall_call();
+   _exit(0);
+   }
+
+   skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, 
pid,
+   "/proc/self/exe",
+   
"uretprobe_syscall_call", );
+   if (!ASSERT_OK_PTR(skel->links.test, 
"bpf_program__attach_uprobe_multi"))
+   goto cleanup;
+
+   /* kick the child */
+   write(go[1], &c, 1);
+   err = waitpid(pid, &status, 0);
+   ASSERT_EQ(err, pid, "waitpid");
+
+   /* verify the child got killed with SIGILL */
+   ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED");
+   ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG");
+
+   /* verify the uretprobe program wasn't called */
+   ASSERT_EQ(skel->bss->executed, 0, "executed");
+
+cleanup:
+   uprobe_syscall_executed__destroy(skel);
+   close(go[1]);
+   close(go[0]);
+}
 #else
 static void test_uretprobe_regs_equal(void)
 {
@@ -219,6 +307,11 @@ static void test_uretprobe_regs_change(void)
 {
test__skip();
 }
+
+static void test_uretprobe_syscall_call(void)
+{
+   test__skip();
+}
 #endif
 
 void test_uprobe_syscall(void)
@@ -227,4 +320,6 @@ void test_uprobe_syscall(void)
test_uretprobe_regs_equal();
if (test__start_subtest("uretprobe_regs_change"))
test_uretprobe_regs_change();
+   if (test__start_subtest("uretprobe_syscall_call"))
+   test_uretprobe_syscall_call();
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c 
b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
new file mode 100644
index ..0d7f1a7db2e2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include 
+#include 
+
+struct pt_regs regs;
+
+char _license[] SEC("license") = "GPL";
+
+int executed = 0;
+
+SEC("uretprobe.multi")
+int test(struct pt_regs *regs)
+{
+   executed = 1;
+   return 0;
+}
-- 
2.45.1




[PATCHv7 bpf-next 6/9] selftests/bpf: Add uretprobe syscall test for regs changes

2024-05-23 Thread Jiri Olsa
Adding test that creates uprobe consumer on uretprobe which changes some
of the registers. Making sure the changed registers are propagated to the
user space when the uretprobe syscall trampoline is used on x86_64.

To be able to do this, adding support to bpf_testmod to create uprobe via
new attribute file:
  /sys/kernel/bpf_testmod_uprobe

This file is expecting a file offset and creates the related uprobe on the
current process exe file, or removes the existing uprobe if the offset is 0.
There can be only a single uprobe at any time.

The uprobe has a specific consumer that changes the registers used in the
uretprobe syscall trampoline, which are later checked in the test.

Acked-by: Andrii Nakryiko 
Reviewed-by: Masami Hiramatsu (Google) 
Signed-off-by: Jiri Olsa 
---
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 +-
 .../selftests/bpf/prog_tests/uprobe_syscall.c |  67 ++
 2 files changed, 189 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c 
b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 2a18bd320e92..b0132a342bb5 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "bpf_testmod.h"
 #include "bpf_testmod_kfunc.h"
 
@@ -358,6 +359,119 @@ static struct bin_attribute bin_attr_bpf_testmod_file 
__ro_after_init = {
.write = bpf_testmod_test_write,
 };
 
+/* bpf_testmod_uprobe sysfs attribute is so far enabled for x86_64 only,
+ * please see test_uretprobe_regs_change test
+ */
+#ifdef __x86_64__
+
+static int
+uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
+  struct pt_regs *regs)
+
+{
+   regs->ax  = 0x12345678deadbeef;
+   regs->cx  = 0x87654321feebdaed;
+   regs->r11 = (u64) -1;
+   return true;
+}
+
+struct testmod_uprobe {
+   struct path path;
+   loff_t offset;
+   struct uprobe_consumer consumer;
+};
+
+static DEFINE_MUTEX(testmod_uprobe_mutex);
+
+static struct testmod_uprobe uprobe = {
+   .consumer.ret_handler = uprobe_ret_handler,
+};
+
+static int testmod_register_uprobe(loff_t offset)
+{
+   int err = -EBUSY;
+
+   if (uprobe.offset)
+   return -EBUSY;
+
+   mutex_lock(&testmod_uprobe_mutex);
+
+   if (uprobe.offset)
+   goto out;
+
+   err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path);
+   if (err)
+   goto out;
+
+   err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry),
+offset, 0, &uprobe.consumer);
+   if (err)
+   path_put(&uprobe.path);
+   else
+   uprobe.offset = offset;
+
+out:
+   mutex_unlock(&testmod_uprobe_mutex);
+   return err;
+}
+
+static void testmod_unregister_uprobe(void)
+{
+   mutex_lock(&testmod_uprobe_mutex);
+
+   if (uprobe.offset) {
+   uprobe_unregister(d_real_inode(uprobe.path.dentry),
+ uprobe.offset, &uprobe.consumer);
+   uprobe.offset = 0;
+   }
+
+   mutex_unlock(&testmod_uprobe_mutex);
+}
+
+static ssize_t
+bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj,
+struct bin_attribute *bin_attr,
+char *buf, loff_t off, size_t len)
+{
+   unsigned long offset = 0;
+   int err = 0;
+
+   if (kstrtoul(buf, 0, &offset))
+   return -EINVAL;
+
+   if (offset)
+   err = testmod_register_uprobe(offset);
+   else
+   testmod_unregister_uprobe();
+
+   return err ?: strlen(buf);
+}
+
+static struct bin_attribute bin_attr_bpf_testmod_uprobe_file __ro_after_init = 
{
+   .attr = { .name = "bpf_testmod_uprobe", .mode = 0666, },
+   .write = bpf_testmod_uprobe_write,
+};
+
+static int register_bpf_testmod_uprobe(void)
+{
+   return sysfs_create_bin_file(kernel_kobj, 
&bin_attr_bpf_testmod_uprobe_file);
+}
+
+static void unregister_bpf_testmod_uprobe(void)
+{
+   testmod_unregister_uprobe();
+   sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_uprobe_file);
+}
+
+#else
+static int register_bpf_testmod_uprobe(void)
+{
+   return 0;
+}
+
+static void unregister_bpf_testmod_uprobe(void) { }
+#endif
+
 BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL)
@@ -912,7 +1026,13 @@ static int bpf_testmod_init(void)
return -EINVAL;
sock = NULL;
mutex_init(_lock);
-   return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
+   ret = sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
+   if (ret < 0)
+   return ret;
+   ret = register_bpf_testmod_uprobe();
+   if (ret < 0)
+   return ret;
+   return 0;
 }
 
 static void bpf_testmod_exit(void)
@@ -927,6 +1047,7 @@ static void bpf_testmod_exit(void)
 
   

[PATCHv7 bpf-next 5/9] selftests/bpf: Add uretprobe syscall test for regs integrity

2024-05-23 Thread Jiri Olsa
Add uretprobe syscall test that compares register values before
and after the uretprobe is hit. It also compares the register
values seen from attached bpf program.

Acked-by: Andrii Nakryiko 
Reviewed-by: Masami Hiramatsu (Google) 
Signed-off-by: Jiri Olsa 
---
 tools/include/linux/compiler.h|   4 +
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 163 ++
 .../selftests/bpf/progs/uprobe_syscall.c  |  15 ++
 3 files changed, 182 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall.c

diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 8a63a9913495..6f7f22ac9da5 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -62,6 +62,10 @@
 #define __nocf_check __attribute__((nocf_check))
 #endif
 
+#ifndef __naked
+#define __naked __attribute__((__naked__))
+#endif
+
 /* Are two types/vars the same type (ignoring qualifiers)? */
 #ifndef __same_type
 # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
new file mode 100644
index ..311ac19d8992
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+
+#ifdef __x86_64__
+
+#include 
+#include 
+#include 
+#include "uprobe_syscall.skel.h"
+
+__naked unsigned long uretprobe_regs_trigger(void)
+{
+   asm volatile (
+   "movq $0xdeadbeef, %rax\n"
+   "ret\n"
+   );
+}
+
+__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
+{
+   asm volatile (
+   "movq %r15,   0(%rdi)\n"
+   "movq %r14,   8(%rdi)\n"
+   "movq %r13,  16(%rdi)\n"
+   "movq %r12,  24(%rdi)\n"
+   "movq %rbp,  32(%rdi)\n"
+   "movq %rbx,  40(%rdi)\n"
+   "movq %r11,  48(%rdi)\n"
+   "movq %r10,  56(%rdi)\n"
+   "movq  %r9,  64(%rdi)\n"
+   "movq  %r8,  72(%rdi)\n"
+   "movq %rax,  80(%rdi)\n"
+   "movq %rcx,  88(%rdi)\n"
+   "movq %rdx,  96(%rdi)\n"
+   "movq %rsi, 104(%rdi)\n"
+   "movq %rdi, 112(%rdi)\n"
+   "movq   $0, 120(%rdi)\n" /* orig_rax */
+   "movq   $0, 128(%rdi)\n" /* rip  */
+   "movq   $0, 136(%rdi)\n" /* cs   */
+   "pushf\n"
+   "pop %rax\n"
+   "movq %rax, 144(%rdi)\n" /* eflags   */
+   "movq %rsp, 152(%rdi)\n" /* rsp  */
+   "movq   $0, 160(%rdi)\n" /* ss   */
+
+   /* save 2nd argument */
+   "pushq %rsi\n"
+   "call uretprobe_regs_trigger\n"
+
+   /* save  return value and load 2nd argument pointer to rax */
+   "pushq %rax\n"
+   "movq 8(%rsp), %rax\n"
+
+   "movq %r15,   0(%rax)\n"
+   "movq %r14,   8(%rax)\n"
+   "movq %r13,  16(%rax)\n"
+   "movq %r12,  24(%rax)\n"
+   "movq %rbp,  32(%rax)\n"
+   "movq %rbx,  40(%rax)\n"
+   "movq %r11,  48(%rax)\n"
+   "movq %r10,  56(%rax)\n"
+   "movq  %r9,  64(%rax)\n"
+   "movq  %r8,  72(%rax)\n"
+   "movq %rcx,  88(%rax)\n"
+   "movq %rdx,  96(%rax)\n"
+   "movq %rsi, 104(%rax)\n"
+   "movq %rdi, 112(%rax)\n"
+   "movq   $0, 120(%rax)\n" /* orig_rax */
+   "movq   $0, 128(%rax)\n" /* rip  */
+   "movq   $0, 136(%rax)\n" /* cs   */
+
+   /* restore return value and 2nd argument */
+   "pop %rax\n"
+   "pop %rsi\n"
+
+   "movq %rax,  80(%rsi)\n"
+
+   "pushf\n"
+   "pop %rax\n"
+
+   "movq %rax, 144(%rsi)\n" /* eflags   */
+   "movq %rsp, 152(%rsi)\n" /* rsp  */
+   "movq   $0, 160(%rsi)\n" /* ss   */
+   "ret\n"
+);
+}
+
+static void test_uretprobe_regs_equal(void)
+{
+   struct uprobe_syscall *skel = NULL;
+   struct pt_regs before = {}, after = {};
+   unsigned long *pb = (unsigned long *) &before;
+   unsigned long *pa = (unsigned long *) &after;
+   unsigned long *pp;
+   unsigned int i, cnt;
+   int err;
+
+   skel = uprobe_syscall__open_and_load();
+   if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load"))
+   goto cleanup;
+
+   err = uprobe_syscall__attach(skel);
+   if (!ASSERT_OK(err, "uprobe_syscall__attach"))
+   goto cleanup;
+
+   uretprobe_regs(&before, &after);
+
+   pp = (unsigned long *) &skel->bss->regs;
+   cnt = sizeof(before)/sizeof(*pb);
+
+   for (i = 0; i < cnt; i++) {

[PATCHv7 bpf-next 4/9] selftests/x86: Add return uprobe shadow stack test

2024-05-23 Thread Jiri Olsa
Adding return uprobe test for shadow stack and making sure it's
working properly. Borrowed some of the code from bpf selftests.

Signed-off-by: Jiri Olsa 
---
 .../testing/selftests/x86/test_shadow_stack.c | 145 ++
 1 file changed, 145 insertions(+)

diff --git a/tools/testing/selftests/x86/test_shadow_stack.c 
b/tools/testing/selftests/x86/test_shadow_stack.c
index 757e6527f67e..e3501b7e2ecc 100644
--- a/tools/testing/selftests/x86/test_shadow_stack.c
+++ b/tools/testing/selftests/x86/test_shadow_stack.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Define the ABI defines if needed, so people can run the tests
@@ -681,6 +682,144 @@ int test_32bit(void)
return !segv_triggered;
 }
 
+static int parse_uint_from_file(const char *file, const char *fmt)
+{
+   int err, ret;
+   FILE *f;
+
+   f = fopen(file, "re");
+   if (!f) {
+   err = -errno;
+   printf("failed to open '%s': %d\n", file, err);
+   return err;
+   }
+   err = fscanf(f, fmt, &ret);
+   if (err != 1) {
+   err = err == EOF ? -EIO : -errno;
+   printf("failed to parse '%s': %d\n", file, err);
+   fclose(f);
+   return err;
+   }
+   fclose(f);
+   return ret;
+}
+
+static int determine_uprobe_perf_type(void)
+{
+   const char *file = "/sys/bus/event_source/devices/uprobe/type";
+
+   return parse_uint_from_file(file, "%d\n");
+}
+
+static int determine_uprobe_retprobe_bit(void)
+{
+   const char *file = 
"/sys/bus/event_source/devices/uprobe/format/retprobe";
+
+   return parse_uint_from_file(file, "config:%d\n");
+}
+
+static ssize_t get_uprobe_offset(const void *addr)
+{
+   size_t start, end, base;
+   char buf[256];
+   bool found = false;
+   FILE *f;
+
+   f = fopen("/proc/self/maps", "r");
+   if (!f)
+   return -errno;
+
+   while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) 
== 4) {
+   if (buf[2] == 'x' && (uintptr_t)addr >= start && 
(uintptr_t)addr < end) {
+   found = true;
+   break;
+   }
+   }
+
+   fclose(f);
+
+   if (!found)
+   return -ESRCH;
+
+   return (uintptr_t)addr - start + base;
+}
+
+static __attribute__((noinline)) void uretprobe_trigger(void)
+{
+   asm volatile ("");
+}
+
+/*
+ * This test sets up a return uprobe, which is sensitive to shadow stack
+ * (crashes without the extra fix). After executing the uretprobe we fail
+ * the test if we receive SIGSEGV; no crash means we're good.
+ *
+ * Helper functions above borrowed from bpf selftests.
+ */
+static int test_uretprobe(void)
+{
+   const size_t attr_sz = sizeof(struct perf_event_attr);
+   const char *file = "/proc/self/exe";
+   int bit, fd = 0, type, err = 1;
+   struct perf_event_attr attr;
+   struct sigaction sa = {};
+   ssize_t offset;
+
+   type = determine_uprobe_perf_type();
+   if (type < 0) {
+   if (type == -ENOENT)
+   printf("[SKIP]\tUretprobe test, uprobes are not 
available\n");
+   return 0;
+   }
+
+   offset = get_uprobe_offset(uretprobe_trigger);
+   if (offset < 0)
+   return 1;
+
+   bit = determine_uprobe_retprobe_bit();
+   if (bit < 0)
+   return 1;
+
+   sa.sa_sigaction = segv_gp_handler;
+   sa.sa_flags = SA_SIGINFO;
+   if (sigaction(SIGSEGV, &sa, NULL))
+   return 1;
+
+   /* Setup return uprobe through perf event interface. */
+   memset(&attr, 0, attr_sz);
+   attr.size = attr_sz;
+   attr.type = type;
+   attr.config = 1 << bit;
+   attr.config1 = (__u64) (unsigned long) file;
+   attr.config2 = offset;
+
+   fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */,
+-1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
+   if (fd < 0)
+   goto out;
+
+   if (sigsetjmp(jmp_buffer, 1))
+   goto out;
+
+   ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
+
+   /*
+* This either segfaults and goes through sigsetjmp above
+* or succeeds and we're good.
+*/
+   uretprobe_trigger();
+
+   printf("[OK]\tUretprobe test\n");
+   err = 0;
+
+out:
+   ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
+   signal(SIGSEGV, SIG_DFL);
+   if (fd)
+   close(fd);
+   return err;
+}
+
 void segv_handler_ptrace(int signum, siginfo_t *si, void *uc)
 {
/* The SSP adjustment caused a segfault. */
@@ -867,6 +1006,12 @@ int main(int argc, char *argv[])
goto out;
}
 
+   if (test_uretprobe()) {
+   ret = 1;
+   printf("[FAIL]\turetprobe test\n");
+   goto out;
+   }
+
return ret;
 
 out:
-- 
2.45.1




[PATCHv7 bpf-next 3/9] uprobe: Add uretprobe syscall to speed up return probe

2024-05-23 Thread Jiri Olsa
Adding uretprobe syscall instead of trap to speed up return probe.

At the moment the uretprobe setup/path is:

  - install entry uprobe

  - when the uprobe is hit, it overwrites probed function's return address
on stack with address of the trampoline that contains breakpoint
instruction

  - the breakpoint trap code handles the uretprobe consumers execution and
jumps back to original return address

This patch replaces the above trampoline's breakpoint instruction with the
new uretprobe syscall. This syscall does exactly the same job as the trap
with some more extra work:

  - syscall trampoline must save original value for rax/r11/rcx registers
on stack - rax is set to syscall number and r11/rcx are changed and
used by syscall instruction

  - the syscall code reads the original values of those registers and
restore those values in task's pt_regs area

  - only caller from trampoline exposed in '[uprobes]' is allowed,
the process will receive SIGILL signal otherwise
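
For reference, the trampoline sequence now roughly looks like this (the
selftest added later in the series mimics it):

	asm volatile (
		"pushq %rax\n"
		"pushq %rcx\n"
		"pushq %r11\n"
		"movq $" __stringify(__NR_uretprobe) ", %rax\n"
		"syscall\n"
		"popq %r11\n"
		"popq %rcx\n"
		"retq\n"
	);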

Even with some extra work, using the uretprobes syscall shows speed
improvement (compared to using standard breakpoint):

  On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz)

  current:
uretprobe-nop  :1.498 ± 0.000M/s
uretprobe-push :1.448 ± 0.001M/s
uretprobe-ret  :0.816 ± 0.001M/s

  with the fix:
uretprobe-nop  :1.969 ± 0.002M/s  < 31% speed up
uretprobe-push :1.910 ± 0.000M/s  < 31% speed up
uretprobe-ret  :0.934 ± 0.000M/s  < 14% speed up

  On Amd (AMD Ryzen 7 5700U)

  current:
uretprobe-nop  :0.778 ± 0.001M/s
uretprobe-push :0.744 ± 0.001M/s
uretprobe-ret  :0.540 ± 0.001M/s

  with the fix:
uretprobe-nop  :0.860 ± 0.001M/s  < 10% speed up
uretprobe-push :0.818 ± 0.001M/s  < 10% speed up
uretprobe-ret  :0.578 ± 0.000M/s  <  7% speed up

The performance test spawns a thread that runs loop which triggers
uprobe with attached bpf program that increments the counter that
gets printed in results above.

The uprobe (and uretprobe) kind is determined by which instruction
is being patched with breakpoint instruction. That's also important
for uretprobes, because uprobe is installed for each uretprobe.

The performance test is part of bpf selftests:
  tools/testing/selftests/bpf/run_bench_uprobes.sh

Note at the moment uretprobe syscall is supported only for native
64-bit process, compat process still uses standard breakpoint.

Note that when shadow stack is enabled the uretprobe syscall returns
via iret, which is slower than return via sysret, but won't cause the
shadow stack violation.

Suggested-by: Andrii Nakryiko 
Reviewed-by: Oleg Nesterov 
Reviewed-by: Masami Hiramatsu (Google) 
Acked-by: Andrii Nakryiko 
Signed-off-by: Oleg Nesterov 
Signed-off-by: Jiri Olsa 
---
 arch/x86/include/asm/shstk.h |   2 +
 arch/x86/kernel/shstk.c  |   5 ++
 arch/x86/kernel/uprobes.c| 117 +++
 include/linux/uprobes.h  |   3 +
 kernel/events/uprobes.c  |  24 ---
 5 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 896909f306e3..4cb77e004615 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -22,6 +22,7 @@ void shstk_free(struct task_struct *p);
 int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
 int shstk_update_last_frame(unsigned long val);
+bool shstk_is_enabled(void);
 #else
 static inline long shstk_prctl(struct task_struct *task, int option,
   unsigned long arg2) { return -EINVAL; }
@@ -33,6 +34,7 @@ static inline void shstk_free(struct task_struct *p) {}
 static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
 static inline int shstk_update_last_frame(unsigned long val) { return 0; }
+static inline bool shstk_is_enabled(void) { return false; }
 #endif /* CONFIG_X86_USER_SHADOW_STACK */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 9797d4cdb78a..059685612362 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -588,3 +588,8 @@ int shstk_update_last_frame(unsigned long val)
ssp = get_user_shstk_addr();
return write_user_shstk_64((u64 __user *)ssp, (u64)val);
 }
+
+bool shstk_is_enabled(void)
+{
+   return features_enabled(ARCH_SHSTK_SHSTK);
+}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6402fb3089d2..5a952c5ea66b 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -308,6 +309,122 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, 
struct insn *insn, bool
 }
 
 #ifdef CONFIG_X86_64
+
+asm (
+   ".pushsection .rodata\n"
+   ".global uretprobe_trampoline_entry\n"
+ 

[PATCHv7 bpf-next 2/9] uprobe: Wire up uretprobe system call

2024-05-23 Thread Jiri Olsa
Wiring up the uretprobe system call, which comes in the following changes.
We need to do the wiring first, because the uretprobe implementation
needs the syscall number.

Note at the moment uretprobe syscall is supported only for native
64-bit process.

Reviewed-by: Oleg Nesterov 
Reviewed-by: Masami Hiramatsu (Google) 
Acked-by: Andrii Nakryiko 
Signed-off-by: Jiri Olsa 
---
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 include/linux/syscalls.h   | 2 ++
 include/uapi/asm-generic/unistd.h  | 5 -
 kernel/sys_ni.c| 2 ++
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index cc78226ffc35..47dfea0a827c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -383,6 +383,7 @@
 459common  lsm_get_self_attr   sys_lsm_get_self_attr
 460common  lsm_set_self_attr   sys_lsm_set_self_attr
 461common  lsm_list_modulessys_lsm_list_modules
+46264  uretprobe   sys_uretprobe
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e619ac10cd23..5318e0e76799 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -972,6 +972,8 @@ asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, 
u32 flags);
 /* x86 */
 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
+asmlinkage long sys_uretprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,
diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index 75f00965ab15..8a747cd1d735 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
 #define __NR_lsm_list_modules 461
 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 
+#define __NR_uretprobe 462
+__SYSCALL(__NR_uretprobe, sys_uretprobe)
+
 #undef __NR_syscalls
-#define __NR_syscalls 462
+#define __NR_syscalls 463
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index faad00cce269..be6195e0d078 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -391,3 +391,5 @@ COND_SYSCALL(setuid16);
 
 /* restartable sequence */
 COND_SYSCALL(rseq);
+
+COND_SYSCALL(uretprobe);
-- 
2.45.1




[PATCHv7 bpf-next 1/9] x86/shstk: Make return uprobe work with shadow stack

2024-05-23 Thread Jiri Olsa
Currently an application with shadow stack enabled will crash
if it sets up a return uprobe. The reason is that the uretprobe kernel
code changes the user space task's stack, but does not update the
shadow stack accordingly.

Adding new functions to update values on shadow stack and using
them in uprobe code to keep shadow stack in sync with uretprobe
changes to user stack.

Reviewed-by: Oleg Nesterov 
Fixes: 488af8ea7131 ("x86/shstk: Wire in shadow stack interface")
Signed-off-by: Jiri Olsa 
---
 arch/x86/include/asm/shstk.h |  2 ++
 arch/x86/kernel/shstk.c  | 11 +++
 arch/x86/kernel/uprobes.c|  7 ++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 42fee8959df7..896909f306e3 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -21,6 +21,7 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *p, 
unsigned long clon
 void shstk_free(struct task_struct *p);
 int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
+int shstk_update_last_frame(unsigned long val);
 #else
 static inline long shstk_prctl(struct task_struct *task, int option,
   unsigned long arg2) { return -EINVAL; }
@@ -31,6 +32,7 @@ static inline unsigned long shstk_alloc_thread_stack(struct 
task_struct *p,
 static inline void shstk_free(struct task_struct *p) {}
 static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
+static inline int shstk_update_last_frame(unsigned long val) { return 0; }
 #endif /* CONFIG_X86_USER_SHADOW_STACK */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 6f1e9883f074..9797d4cdb78a 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -577,3 +577,14 @@ long shstk_prctl(struct task_struct *task, int option, 
unsigned long arg2)
return wrss_control(true);
return -EINVAL;
 }
+
+int shstk_update_last_frame(unsigned long val)
+{
+   unsigned long ssp;
+
+   if (!features_enabled(ARCH_SHSTK_SHSTK))
+   return 0;
+
+   ssp = get_user_shstk_addr();
+   return write_user_shstk_64((u64 __user *)ssp, (u64)val);
+}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c07f6daaa22..6402fb3089d2 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1076,8 +1076,13 @@ arch_uretprobe_hijack_return_addr(unsigned long 
trampoline_vaddr, struct pt_regs
return orig_ret_vaddr;
 
	nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
-   if (likely(!nleft))
+   if (likely(!nleft)) {
+   if (shstk_update_last_frame(trampoline_vaddr)) {
+   force_sig(SIGSEGV);
+   return -1;
+   }
return orig_ret_vaddr;
+   }
 
if (nleft != rasize) {
pr_err("return address clobbered: pid=%d, %%sp=%#lx, 
%%ip=%#lx\n",
-- 
2.45.1




[PATCHv7 bpf-next 0/9] uprobe: uretprobe speed up

2024-05-23 Thread Jiri Olsa
hi,
as part of the effort on speeding up uprobes [0], this series adds a
return uprobe optimization that uses a syscall instead of the trap
on the uretprobe trampoline.

The speedup depends on the instruction type that the uprobe is installed
on and on the specific HW type; please check patch 1 for details.

Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
apply-able on linux-trace.git tree probes/for-next branch.
Patch 9 is based on man-pages master.

v7 changes:
- fixes in man page [Alejandro Colomar]
- fixed patch #1 fixes tag [Oleg]

Also available at:
  https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
  uretprobe_syscall

thanks,
jirka


Notes to check list items in Documentation/process/adding-syscalls.rst:

- System Call Alternatives
  New syscall seems like the best way in here, because we need
  just to quickly enter kernel with no extra arguments processing,
  which we'd need to do if we decided to use another syscall.

- Designing the API: Planning for Extension
  The uretprobe syscall is very specific and most likely won't be
  extended in the future.

  At the moment it does not take any arguments and even if it does
  in future, it's allowed to be called only from trampoline prepared
  by kernel, so there'll be no broken user.

- Designing the API: Other Considerations
  N/A because uretprobe syscall does not return reference to kernel
  object.

- Proposing the API
  Wiring up of the uretprobe system call is in separate change,
  selftests and man page changes are part of the patchset.

- Generic System Call Implementation
  There's no CONFIG option for the new functionality because it
  keeps the same behaviour from the user POV.

- x86 System Call Implementation
  It's 64-bit syscall only.

- Compatibility System Calls (Generic)
  N/A uretprobe syscall has no arguments and is not supported
  for compat processes.

- Compatibility System Calls (x86)
  N/A uretprobe syscall is not supported for compat processes.

- System Calls Returning Elsewhere
  N/A.

- Other Details
  N/A.

- Testing
  Adding new bpf selftests and ran ltp on top of this change.

- Man Page
  Attached.

- Do not call System Calls in the Kernel
  N/A.


[0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
---
Jiri Olsa (8):
  x86/shstk: Make return uprobe work with shadow stack
  uprobe: Wire up uretprobe system call
  uprobe: Add uretprobe syscall to speed up return probe
  selftests/x86: Add return uprobe shadow stack test
  selftests/bpf: Add uretprobe syscall test for regs integrity
  selftests/bpf: Add uretprobe syscall test for regs changes
  selftests/bpf: Add uretprobe syscall call from user space test
  selftests/bpf: Add uretprobe shadow stack test

 arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
 arch/x86/include/asm/shstk.h|   4 +
 arch/x86/kernel/shstk.c |  16 
 arch/x86/kernel/uprobes.c   | 124 
-
 include/linux/syscalls.h|   2 +
 include/linux/uprobes.h |   3 +
 include/uapi/asm-generic/unistd.h   |   5 +-
 kernel/events/uprobes.c |  24 --
 kernel/sys_ni.c |   2 +
 tools/include/linux/compiler.h  |   4 +
 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 
-
 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 385 
+++
 tools/testing/selftests/bpf/progs/uprobe_syscall.c  |  15 
 tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c |  17 
 tools/testing/selftests/x86/test_shadow_stack.c | 145 
++
 15 files changed, 860 insertions(+), 10 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall.c
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c

Jiri Olsa (1):
  man2: Add uretprobe syscall page

 man/man2/uretprobe.2 | 56 

 1 file changed, 56 insertions(+)
 create mode 100644 man/man2/uretprobe.2



[PATCH] riscv: Fix early ftrace nop patching

2024-05-23 Thread Alexandre Ghiti
Commit c97bf629963e ("riscv: Fix text patching when IPI are used")
converted ftrace_make_nop() to use patch_insn_write() which does not
emit any icache flush relying entirely on __ftrace_modify_code() to do
that.

But we missed that ftrace_make_nop() was called very early directly when
converting mcount calls into nops (actually on riscv it converts 2B nops
emitted by the compiler into 4B nops).

This caused crashes on multiple HW as reported by Conor and Björn since
the booting core could have half-patched instructions in its icache
which would trigger an illegal instruction trap. Fix this by emitting a
local icache flush when patching nops early.

Fixes: c97bf629963e ("riscv: Fix text patching when IPI are used")
Signed-off-by: Alexandre Ghiti 
---
 arch/riscv/include/asm/cacheflush.h | 6 ++
 arch/riscv/kernel/ftrace.c  | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/riscv/include/asm/cacheflush.h 
b/arch/riscv/include/asm/cacheflush.h
index dd8d07146116..ce79c558a4c8 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -13,6 +13,12 @@ static inline void local_flush_icache_all(void)
asm volatile ("fence.i" ::: "memory");
 }
 
+static inline void local_flush_icache_range(unsigned long start,
+   unsigned long end)
+{
+   local_flush_icache_all();
+}
+
 #define PG_dcache_clean PG_arch_1
 
 static inline void flush_dcache_folio(struct folio *folio)
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 4f4987a6d83d..32e7c401dfb4 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -120,6 +120,9 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace 
*rec)
out = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
	mutex_unlock(&text_mutex);
 
+   if (!mod)
+   local_flush_icache_range(rec->ip, rec->ip + MCOUNT_INSN_SIZE);
+
return out;
 }
 
-- 
2.39.2




Re: [RFC PATCH 0/5] vsock/virtio: Add support for multi-devices

2024-05-23 Thread Stefano Garzarella

Hi,
thanks for this RFC!

On Fri, May 17, 2024 at 10:46:02PM GMT, Xuewei Niu wrote:

# Motivition

Vsock is a lightweight and widely used data exchange mechanism between host
and guest. Kata Containers, a secure container runtime, leverages the
capability to exchange control data between the shim and the kata-agent.

The Linux kernel only supports one vsock device for virtio-vsock transport,
resulting in the following limitations:

* Poor performance isolation: All vsock connections share the same
virtqueue.


This might be fixed if we implement multi-queue in virtio-vsock.


* Cannot enable more than one backend: Virtio-vsock, vhost-vsock, and
vhost-user-vsock cannot be enabled simultaneously on the transport.

We’d like to transfer networking data, such as TSI (Transparent Socket
Impersonation), over vsock via the vhost-user protocol to reduce overhead.
However, by default, the vsock device is occupied by the kata-agent.

# Usages

Principle: **Supporting virtio-vsock multi-devices while also being
compatible with existing ones.**

## Connection from Guest to Host

There are two important questions to consider:

1. How to be compatible with the existing usages?
2. How do we specify a virtio-vsock device?

### Question 1

Before we delve into question 1, I'd like to provide a piece of pseudocode
as an example of one of the existing use cases from the guest's
perspective.

Assume there is one virtio-vsock device with CID 4. One of the existing
usages to connect to the host is shown as follows.

```
fd = socket(AF_VSOCK);
connect(fd, 2, 1234);
n = write(fd, buffer);
```

The result is that a connection is established from the guest (4, ?) to the
host (2, 1234), where "?" denotes a random port.

In the multi-device case, there is more than one device. If the user
does not specify a CID explicitly, the kernel cannot tell which device
to use. The new implementation should be compatible with the old one.

We expanded the virtio-vsock specification to address this issue. The
specification now includes a new field called "order".

```
struct virtio_vsock_config {
 __le64 guest_cid;
 __le64 order;
} _attribute_((packed));
```

In the phase of virtio-vsock driver probing, the guest kernel reads the
"order" field from the VMM to get the order of each device. **We stipulate
that the device with the smallest order is regarded as the default
device** (this mechanism functions as a 'default gateway' in networking).

Assuming there are three virtio-vsock devices: device1 (CID=3), device2
(CID=4), and device3 (CID=5). The arrangement of the list is as follows
from the perspective of the guest kernel:

```
virtio_vsock_list =
virtio_vsock { cid: 4, order: 0 } -> virtio_vsock { cid: 3, order: 1 } -> 
virtio_vsock { cid: 5, order: 10 }
```

At this time, the guest kernel realizes that the device2 (CID=4) is the
default device. Execute the same code as before.

```
fd = socket(AF_VSOCK);
connect(fd, 2, 1234);
n = write(fd, buffer);
```

A connection will be established from the guest (4, ?) to the host (2, 1234).


It seems that only the one with order 0 is used here though, so what is 
the ordering for?
Wouldn't it suffice to simply indicate the default device (e.g., like 
the default gateway for networking)?




### Question 2

Now, the user wants to specify a device instead of the default one. An
explicit bind operation is required.

Use the device (CID=3), where “-1” represents any port, the kernel will


We have a macro: VMADDR_PORT_ANY (which is -1)


search an available port automatically.

```
fd = socket(AF_VSOCK);
bind(fd, 3, -1);
connect(fd, 2, 1234);)
n = write(fd, buffer);
```

Use the device (CID=4).

```
fd = socket(AF_VSOCK);
bind(fd, 4, -1);
connect(fd, 2, 1234);
n = write(fd, buffer);
```
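
As noted above, the "-1" port is VMADDR_PORT_ANY. A rough sketch of this
explicit-device selection with the real socket API, assuming the
bind-to-CID semantics proposed in this RFC (CID 3 is just the example
device, includes as in the previous sketch, error handling omitted):

```c
/* Select the source device by binding to its CID with any port. */
struct sockaddr_vm local = {
	.svm_family = AF_VSOCK,
	.svm_cid    = 3,		/* the device with CID 3 */
	.svm_port   = VMADDR_PORT_ANY,
};
struct sockaddr_vm remote = {
	.svm_family = AF_VSOCK,
	.svm_cid    = VMADDR_CID_HOST,
	.svm_port   = 1234,
};
int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

bind(fd, (struct sockaddr *)&local, sizeof(local));	/* pick the device */
connect(fd, (struct sockaddr *)&remote, sizeof(remote));
```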

## Connection from Host to Guest

Connection from host to guest is quite similar to the existing usages. The
device’s CID is specified by the bind operation.

Listen at the device (CID=3)’s port 1.

```
fd = socket(AF_VSOCK);
bind(fd, 3, 1);
listen(fd);
new_fd = accept(fd, &peer_cid, &peer_port);
n = write(fd, buffer);
```

Listen at the device (CID=4)’s port 1.

```
fd = socket(AF_VSOCK);
bind(fd, 4, 1);
listen(fd);
new_fd = accept(fd, &peer_cid, &peer_port);
n = write(fd, buffer);
```

# Use Cases

We've completed a POC with Kata Containers, Ztunnel, which is a
purpose-built per-node proxy for Istio ambient mesh, and TSI. Please refer
to the following link for more details.

Link: https://bit.ly/4bdPJbU


Thank you for this RFC, I left several comments in the patches, we still 
have some work to do, but I think it is something we can support :-)


Here I summarize the things that I think we need to fix:
1. Avoid adding transport-specific things in af_vsock.c
   We need to have a generic API to allow other transports to implement
   the same functionality.
2. We need to add negotiation of a new feature in virtio/vhost transports
   We need to enable or disable support depending on whether the
   feature is 

Re: [RFC PATCH 5/5] vsock: Add an ioctl request to get all CIDs

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:07PM GMT, Xuewei Niu wrote:

The new request is called `IOCTL_VM_SOCKETS_GET_LOCAL_CIDS`. And the old
one, `IOCTL_VM_SOCKETS_GET_LOCAL_CID` is retained.

For the transport that supports multi-devices:

* `IOCTL_VM_SOCKETS_GET_LOCAL_CID` returns "-1";


What about returning the default CID (lower prio)?

* `IOCTL_VM_SOCKETS_GET_LOCAL_CIDS` returns a vector of CIDs. The usage is
shown as follows.

```
struct vsock_local_cids local_cids;
if ((ret = ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CIDS, &local_cids))) {
   perror("failed to get cids");
   exit(1);
}
for (i = 0; i < local_cids.nr; i++)
   ...
```
---
include/net/af_vsock.h  |  7 +++
include/uapi/linux/vm_sockets.h |  8 
net/vmw_vsock/af_vsock.c| 19 +++
3 files changed, 34 insertions(+)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 25f7dc3d602d..2febc816e388 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -264,4 +264,11 @@ static inline bool vsock_msgzerocopy_allow(const struct 
vsock_transport *t)
{
return t->msgzerocopy_allow && t->msgzerocopy_allow();
}
+
+/ IOCTL /
+/* Type of return value of IOCTL_VM_SOCKETS_GET_LOCAL_CIDS. */
+struct vsock_local_cids {
+   int nr;
+   unsigned int data[MAX_VSOCK_NUM];
+};
#endif /* __AF_VSOCK_H__ */
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index 36ca5023293a..01f73fb7af5a 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -195,8 +195,16 @@ struct sockaddr_vm {

#define MAX_VSOCK_NUM 16


Okay, now I see why you need this in the UAPI, but pleace try to follow
other defines.

What about VM_SOCKETS_MAX_DEVS ?



+/* Return actual context id if the transport not support vsock
+ * multi-devices. Otherwise, return `-1U`.
+ */
+
#define IOCTL_VM_SOCKETS_GET_LOCAL_CID  _IO(7, 0xb9)

+/* Only available in transports that support multiple devices. */
+
+#define IOCTL_VM_SOCKETS_GET_LOCAL_CIDS _IOR(7, 0xba, struct 
vsock_local_cids)
+
/* MSG_ZEROCOPY notifications are encoded in the standard error format,
 * sock_extended_err. See Documentation/networking/msg_zerocopy.rst in
 * kernel source tree for more details.
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 3b34be802bf2..2ea2ff52f15b 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2454,6 +2454,7 @@ static long vsock_dev_do_ioctl(struct file *filp,
u32 __user *p = ptr;
u32 cid = VMADDR_CID_ANY;
int retval = 0;
+   struct vsock_local_cids local_cids;

switch (cmd) {
case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
@@ -2469,6 +2470,24 @@ static long vsock_dev_do_ioctl(struct file *filp,
retval = -EFAULT;
break;

+   case IOCTL_VM_SOCKETS_GET_LOCAL_CIDS:
+   if (!transport_g2h || !transport_g2h->get_local_cids)
+   goto fault;
+
+   rcu_read_lock();
+   local_cids.nr = transport_g2h->get_local_cids(local_cids.data);
+   rcu_read_unlock();
+
+   if (local_cids.nr < 0 ||
+   copy_to_user(p, &local_cids, sizeof(local_cids)))
+   goto fault;
+
+   break;
+
+fault:
+   retval = -EFAULT;
+   break;
+
default:
retval = -ENOIOCTLCMD;
}
--
2.34.1






Re: [RFC PATCH 4/5] vsock: seqpacket_allow adapts to multi-devices

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:06PM GMT, Xuewei Niu wrote:

Adds a new argument, named "src_cid", to let them know which `virtio_vsock`
to be selected.

Signed-off-by: Xuewei Niu 
---
include/net/af_vsock.h   |  2 +-
net/vmw_vsock/af_vsock.c | 15 +--
net/vmw_vsock/virtio_transport.c |  4 ++--
net/vmw_vsock/vsock_loopback.c   |  4 ++--
4 files changed, 18 insertions(+), 7 deletions(-)


Same for this.



diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 0151296a0bc5..25f7dc3d602d 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -143,7 +143,7 @@ struct vsock_transport {
 int flags);
int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
 size_t len);
-   bool (*seqpacket_allow)(u32 remote_cid);
+   bool (*seqpacket_allow)(u32 src_cid, u32 remote_cid);
u32 (*seqpacket_has_data)(struct vsock_sock *vsk);

/* Notification. */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index da06ddc940cd..3b34be802bf2 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -470,10 +470,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct 
vsock_sock *psk)
{
const struct vsock_transport *new_transport;
struct sock *sk = sk_vsock(vsk);
-   unsigned int remote_cid = vsk->remote_addr.svm_cid;
+   unsigned int src_cid, remote_cid;
__u8 remote_flags;
int ret;

+   remote_cid = vsk->remote_addr.svm_cid;
+
/* If the packet is coming with the source and destination CIDs higher
 * than VMADDR_CID_HOST, then a vsock channel where all the packets are
 * forwarded to the host should be established. Then the host will
@@ -527,8 +529,17 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct 
vsock_sock *psk)
return -ENODEV;

if (sk->sk_type == SOCK_SEQPACKET) {
+   if (vsk->local_addr.svm_cid == VMADDR_CID_ANY) {
+   if (new_transport->get_default_cid)
+   src_cid = new_transport->get_default_cid();
+   else
+   src_cid = new_transport->get_local_cid();
+   } else {
+   src_cid = vsk->local_addr.svm_cid;
+   }
+
if (!new_transport->seqpacket_allow ||
-   !new_transport->seqpacket_allow(remote_cid)) {
+   !new_transport->seqpacket_allow(src_cid, remote_cid)) {
module_put(new_transport->module);
return -ESOCKTNOSUPPORT;
}
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 998b22e5ce36..0bddcbd906a2 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -615,14 +615,14 @@ static struct virtio_transport virtio_transport = {
.can_msgzerocopy = virtio_transport_can_msgzerocopy,
};

-static bool virtio_transport_seqpacket_allow(u32 remote_cid)
+static bool virtio_transport_seqpacket_allow(u32 src_cid, u32 remote_cid)
{
struct virtio_vsock *vsock;
bool seqpacket_allow;

seqpacket_allow = false;
rcu_read_lock();
-   vsock = rcu_dereference(the_virtio_vsock);
+   vsock = virtio_transport_get_virtio_vsock(src_cid);
if (vsock)
seqpacket_allow = vsock->seqpacket_allow;
rcu_read_unlock();
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 6dea6119f5b2..b94358f5bb2c 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -46,7 +46,7 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
return 0;
}

-static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
+static bool vsock_loopback_seqpacket_allow(u32 src_cid, u32 remote_cid);
static bool vsock_loopback_msgzerocopy_allow(void)
{
return true;
@@ -104,7 +104,7 @@ static struct virtio_transport loopback_transport = {
.send_pkt = vsock_loopback_send_pkt,
};

-static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
+static bool vsock_loopback_seqpacket_allow(u32 src_cid, u32 remote_cid)
{
return true;
}
--
2.34.1






Re: [RFC PATCH 3/5] vsock/virtio: can_msgzerocopy adapts to multi-devices

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:05PM GMT, Xuewei Niu wrote:

Adds a new argument, named "cid", to let them know which `virtio_vsock` to
be selected.

Signed-off-by: Xuewei Niu 
---
include/linux/virtio_vsock.h| 2 +-
net/vmw_vsock/virtio_transport.c| 5 ++---
net/vmw_vsock/virtio_transport_common.c | 6 +++---
3 files changed, 6 insertions(+), 7 deletions(-)


Every commit in linux must be working to support bisection. So these 
changes should be made before adding multi-device support.




diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..21bfd5e0c2e7 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -168,7 +168,7 @@ struct virtio_transport {
 * extra checks and can perform zerocopy transmission by
 * default.
 */
-   bool (*can_msgzerocopy)(int bufs_num);
+   bool (*can_msgzerocopy)(u32 cid, int bufs_num);
};

ssize_t
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 93d25aeafb83..998b22e5ce36 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -521,14 +521,13 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
	queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

-static bool virtio_transport_can_msgzerocopy(int bufs_num)
+static bool virtio_transport_can_msgzerocopy(u32 cid, int bufs_num)
{
struct virtio_vsock *vsock;
bool res = false;

rcu_read_lock();
-
-   vsock = rcu_dereference(the_virtio_vsock);
+   vsock = virtio_transport_get_virtio_vsock(cid);
if (vsock) {
struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index bed75a41419e..e7315d7b9af1 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -39,7 +39,7 @@ virtio_transport_get_ops(struct vsock_sock *vsk)

static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops,
   struct virtio_vsock_pkt_info *info,
-  size_t pkt_len)
+  size_t pkt_len, unsigned int cid)
{
struct iov_iter *iov_iter;

@@ -62,7 +62,7 @@ static bool virtio_transport_can_zcopy(const struct 
virtio_transport *t_ops,
int pages_to_send = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);

/* +1 is for packet header. */
-   return t_ops->can_msgzerocopy(pages_to_send + 1);
+   return t_ops->can_msgzerocopy(cid, pages_to_send + 1);
}

return true;
@@ -375,7 +375,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock 
*vsk,
info->msg->msg_flags &= ~MSG_ZEROCOPY;

if (info->msg->msg_flags & MSG_ZEROCOPY)
-   can_zcopy = virtio_transport_can_zcopy(t_ops, info, 
pkt_len);
+   can_zcopy = virtio_transport_can_zcopy(t_ops, info, 
pkt_len, src_cid);

if (can_zcopy)
max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
--
2.34.1






Re: [RFC PATCH 2/5] vsock/virtio: Add support for multi-devices

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:04PM GMT, Xuewei Niu wrote:

The maximum number of devices is limited by `MAX_VSOCK_NUM`.

Extends `vsock_transport` struct with 4 methods to support multi-devices:

* `get_virtio_vsock()`: It receives a CID, and returns a struct of virtio
 vsock. This method is designed to select a vsock device by its CID.
* `get_default_cid()`: It receives nothing, returns the default CID of the
 first vsock device registered to the kernel.
* `get_local_cids()`: It returns a vector of vsock devices' CIDs.
* `compare_order()`: It receives two different CIDs, named "left" and
 "right" respectively. It returns "-1" when "left" is behind "right".
 Otherwise, it returns "1".

`get_local_cid()` is retained, but returns "-1" if the transport supports
multi-devices.

Replaces the single instance of `virtio_vsock` with a list, named
`virtio_vsock_list`. The devices are inserted into the list when probing.

The kernel will refuse to register devices if there are conflicts in
CIDs or orders.

Signed-off-by: Xuewei Niu 
---
include/net/af_vsock.h  |  16 ++
include/uapi/linux/vm_sockets.h |   6 +
net/vmw_vsock/af_vsock.c|  82 ++--
net/vmw_vsock/virtio_transport.c| 246 ++--
net/vmw_vsock/virtio_transport_common.c |  10 +-
5 files changed, 293 insertions(+), 67 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 535701efc1e5..0151296a0bc5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -174,6 +174,22 @@ struct vsock_transport {

/* Addressing. */
u32 (*get_local_cid)(void);
+   /* Held rcu read lock by the caller. */


We should also explain why the rcu is needed.


+   struct virtio_vsock *(*get_virtio_vsock)(unsigned int cid);


af_vsock supports several transports (i.e. HyperV, VMCI, VIRTIO/VHOST,
loobpack), so we need to be generic here.

In addition, the pointer returned by this function is never used, so
why we need this?


+   unsigned int (*get_default_cid)(void);
+   /* Get an list containing all the CIDs of registered vsock.   Return
+* the length of the list.
+*
+* Held rcu read lock by the caller.
+*/
+   int (*get_local_cids)(unsigned int *local_cids);


Why int? get_local_cid() returns an u32, we should do the same.

In addition, can we remove get_local_cid() and implement 
get_local_cids() for all the transports?



+   /* Compare the order of two devices.  Given the guest CIDs of two
+* different devices, returns -1 while the left is behind the right.
+* Otherwise, return 1.
+*
+* Held rcu read lock by the caller.
+*/
+   int (*compare_order)(unsigned int left, unsigned int right);


Please check better the type for CIDs all over the place.



/* Read a single skb */
int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index ed07181d4eff..36ca5023293a 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -189,6 +189,12 @@ struct sockaddr_vm {
   sizeof(__u8)];
};

+/* The maximum number of vsock devices.  Each vsock device has an exclusive
+ * context id.
+ */
+
+#define MAX_VSOCK_NUM 16


This is used internally in AF_VSOCK, I don't think we should expose it
in the UAPI.



+
#define IOCTL_VM_SOCKETS_GET_LOCAL_CID  _IO(7, 0xb9)

/* MSG_ZEROCOPY notifications are encoded in the standard error format,
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 54ba7316f808..da06ddc940cd 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -234,19 +234,45 @@ static void __vsock_remove_connected(struct vsock_sock 
*vsk)

static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
-   struct vsock_sock *vsk;
+   struct vsock_sock *vsk, *any_vsk = NULL;

+   rcu_read_lock();


Why the rcu is needed?

	list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) 
	{

+   /* The highest priority: full match. */
		if (vsock_addr_equals_addr(addr, &vsk->local_addr))
-   return sk_vsock(vsk);
+   goto out;

-   if (addr->svm_port == vsk->local_addr.svm_port &&
-   (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
-addr->svm_cid == VMADDR_CID_ANY))
-   return sk_vsock(vsk);
+   /* Port match */
+   if (addr->svm_port == vsk->local_addr.svm_port) {
+   /* The second priority: local cid is VMADDR_CID_ANY. */
+   if (vsk->local_addr.svm_cid == VMADDR_CID_ANY)
+   goto out;
+
+   /* The third priority: local cid isn't VMADDR_CID_ANY. 
*/
+   if (addr->svm_cid == 

Re: [RFC PATCH 1/5] vsock/virtio: Extend virtio-vsock spec with an "order" field

2024-05-23 Thread Stefano Garzarella

As Alyssa suggested, we should discuss spec changes in the virtio ML.
BTW as long as this is an RFC, it's fine. Just be sure, though, to 
remember to merge the change in the specification first versus the 
patches in Linux.
So I recommend that you don't send a non-RFC set into Linux until you 
have agreed on the changes to the specification.


On Fri, May 17, 2024 at 10:46:03PM GMT, Xuewei Niu wrote:

The "order" field determines the location of the device in the linked list,
the device with CID 4, having a smallest order, is in the first place, and
so forth.


Do we really need an order, or would it suffice to just indicate the 
device to be used by default? (as the default gateway in networking)




Rules:

* It doesn’t have to be continuous;
* There must be no conflicts;
* It is optional for the mode of a single device, but is required for the
 mode of multiple devices.


We should also add a feature to support this new field.



Signed-off-by: Xuewei Niu 
---
include/uapi/linux/virtio_vsock.h | 1 +
1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index 64738838bee5..b62ec7d2ab1e 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -43,6 +43,7 @@

struct virtio_vsock_config {
__le64 guest_cid;
+   __le64 order;


Do we really need 64 bits for the order?


} __attribute__((packed));

enum virtio_vsock_event_id {
--
2.34.1






Re: [PATCH] x86/paravirt: Disable virt spinlock when CONFIG_PARAVIRT_SPINLOCKS disabled

2024-05-23 Thread Juergen Gross

On 16.05.24 15:02, Chen Yu wrote:

Performance drop is reported when running encode/decode workload and
BenchSEE cache sub-workload.
Bisect points to commit ce0a1b608bfc ("x86/paravirt: Silence unused
native_pv_lock_init() function warning"). When CONFIG_PARAVIRT_SPINLOCKS
is disabled the virt_spin_lock_key is set to true on bare-metal.
The qspinlock degenerates to a test-and-set spinlock, which decreases
performance on bare-metal.

Fix this by disabling virt_spin_lock_key if CONFIG_PARAVIRT_SPINLOCKS
is not set, or it is on bare-metal.

Fixes: ce0a1b608bfc ("x86/paravirt: Silence unused native_pv_lock_init() function 
warning")
Suggested-by: Qiuxu Zhuo 
Reported-by: Prem Nath Dey 
Reported-by: Xiaoping Zhou 
Signed-off-by: Chen Yu 


Reviewed-by: Juergen Gross 


Juergen


---
  arch/x86/kernel/paravirt.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..ee51c0949ed8 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -55,7 +55,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
  
  void __init native_pv_lock_init(void)

  {
-   if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
+   if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) ||
!boot_cpu_has(X86_FEATURE_HYPERVISOR))
	static_branch_disable(&virt_spin_lock_key);
  }




OpenPGP_0xB0DE9DD628BF132F.asc
Description: OpenPGP public key


OpenPGP_signature.asc
Description: OpenPGP digital signature


[PATCH] ring-buffer: Align meta-page to sub-buffers for improved TLB usage

2024-05-23 Thread Vincent Donnefort
Previously, the mapped ring-buffer layout caused misalignment between
the meta-page and sub-buffers when the sub-buffer size was not a
multiple of PAGE_SIZE. This prevented hardware with larger TLB entries
from utilizing them effectively.

Add a padding with the zero-page between the meta-page and sub-buffers.
Also update the ring-buffer map_test to verify that padding.

Signed-off-by: Vincent Donnefort 

-- 

This is based on the mm-unstable branch [1] as it depends on David's work [2]
for allowing the zero-page in vm_insert_page().

[1] https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git
[2] https://lore.kernel.org/all/20240522125713.775114-1-da...@redhat.com
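
For illustration, a user-space reader under the new layout maps the
meta-page first and then the data region at a sub-buffer-aligned offset.
A rough sketch, assuming the trace_mmap.h UAPI from the ring-buffer
mapping series and an already-open per-CPU buffer descriptor (the helper
name is illustrative, error handling omitted):

```c
#include <unistd.h>
#include <sys/mman.h>
#include <linux/trace_mmap.h>

/* cpu_fd: an already-open per-CPU trace buffer file descriptor. */
static void *map_cpu_buffer(int cpu_fd, struct trace_buffer_meta **metap)
{
	struct trace_buffer_meta *meta;

	/* The meta-page itself still fits in one page. */
	meta = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, cpu_fd, 0);

	/*
	 * With this patch meta->meta_page_size is sub-buffer sized
	 * (zero-page padded), so the data region starts at that offset.
	 */
	*metap = meta;
	return mmap(NULL, (size_t)meta->nr_subbufs * meta->subbuf_size,
		    PROT_READ, MAP_SHARED, cpu_fd, meta->meta_page_size);
}
```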

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7345a8b625fb..acaab4d4288f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6148,10 +6148,10 @@ static void rb_setup_ids_meta_page(struct 
ring_buffer_per_cpu *cpu_buffer,
/* install subbuf ID to kern VA translation */
cpu_buffer->subbuf_ids = subbuf_ids;
 
-   meta->meta_page_size = PAGE_SIZE;
meta->meta_struct_len = sizeof(*meta);
meta->nr_subbufs = nr_subbufs;
meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+   meta->meta_page_size = meta->subbuf_size;
 
rb_update_meta_page(cpu_buffer);
 }
@@ -6238,6 +6238,12 @@ static int __rb_map_vma(struct ring_buffer_per_cpu 
*cpu_buffer,
!(vma->vm_flags & VM_MAYSHARE))
return -EPERM;
 
+   subbuf_order = cpu_buffer->buffer->subbuf_order;
+   subbuf_pages = 1 << subbuf_order;
+
+   if (subbuf_order && pgoff % subbuf_pages)
+   return -EINVAL;
+
/*
 * Make sure the mapping cannot become writable later. Also tell the VM
 * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
@@ -6247,11 +6253,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu 
*cpu_buffer,
 
	lockdep_assert_held(&cpu_buffer->mapping_lock);
 
-   subbuf_order = cpu_buffer->buffer->subbuf_order;
-   subbuf_pages = 1 << subbuf_order;
-
nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
-   nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
+   nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */
 
vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (!vma_pages || vma_pages > nr_pages)
@@ -6264,20 +6267,20 @@ static int __rb_map_vma(struct ring_buffer_per_cpu 
*cpu_buffer,
return -ENOMEM;
 
if (!pgoff) {
+   unsigned long meta_page_padding;
+
pages[p++] = virt_to_page(cpu_buffer->meta_page);
 
/*
-* TODO: Align sub-buffers on their size, once
-* vm_insert_pages() supports the zero-page.
+* Pad with the zero-page to align the meta-page with the
+* sub-buffers.
 */
+   meta_page_padding = subbuf_pages - 1;
+   while (meta_page_padding-- && p < nr_pages)
+   pages[p++] = ZERO_PAGE(vma->vm_start + (PAGE_SIZE * p));
} else {
/* Skip the meta-page */
-   pgoff--;
-
-   if (pgoff % subbuf_pages) {
-   err = -EINVAL;
-   goto out;
-   }
+   pgoff -= subbuf_pages;
 
s += pgoff / subbuf_pages;
}
diff --git a/tools/testing/selftests/ring-buffer/map_test.c 
b/tools/testing/selftests/ring-buffer/map_test.c
index a9006fa7097e..4bb0192e43f3 100644
--- a/tools/testing/selftests/ring-buffer/map_test.c
+++ b/tools/testing/selftests/ring-buffer/map_test.c
@@ -228,6 +228,20 @@ TEST_F(map, data_mmap)
data = mmap(NULL, data_len, PROT_READ, MAP_SHARED,
desc->cpu_fd, meta_len);
ASSERT_EQ(data, MAP_FAILED);
+
+   /* Verify meta-page padding */
+   if (desc->meta->meta_page_size > getpagesize()) {
+   void *addr;
+
+   data_len = desc->meta->meta_page_size;
+   data = mmap(NULL, data_len,
+   PROT_READ, MAP_SHARED, desc->cpu_fd, 0);
+   ASSERT_NE(data, MAP_FAILED);
+
+   addr = (void *)((unsigned long)data + getpagesize());
+   ASSERT_EQ(*((int *)addr), 0);
+   munmap(data, data_len);
+   }
 }
 
 FIXTURE(snapshot) {

base-commit: c65920c76a977c2b73c3a8b03b4c0c00cc1285ed
-- 
2.45.1.288.g0e0cd299f1-goog




Re: [RFC PATCH 1/5] vsock/virtio: Extend virtio-vsock spec with an "order" field

2024-05-23 Thread Alyssa Ross
(CCing virtio-comment, since this proposes adding a field to a struct
that is standardized[1] in the VIRTIO spec, so changes to the Linux
implementation should presumably be coordinated with changes to the
spec.)

[1]: 
https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-4780004

On Fri, May 17, 2024 at 10:46:03PM +0800, Xuewei Niu wrote:
> The "order" field determines the location of the device in the linked list,
> the device with CID 4, having a smallest order, is in the first place, and
> so forth.
>
> Rules:
>
> * It doesn’t have to be continuous;
> * It cannot exist conflicts;
> * It is optional for the mode of a single device, but is required for the
>   mode of multiple devices.
>
> Signed-off-by: Xuewei Niu 
> ---
>  include/uapi/linux/virtio_vsock.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/include/uapi/linux/virtio_vsock.h 
> b/include/uapi/linux/virtio_vsock.h
> index 64738838bee5..b62ec7d2ab1e 100644
> --- a/include/uapi/linux/virtio_vsock.h
> +++ b/include/uapi/linux/virtio_vsock.h
> @@ -43,6 +43,7 @@
>
>  struct virtio_vsock_config {
>   __le64 guest_cid;
> + __le64 order;
>  } __attribute__((packed));
>
>  enum virtio_vsock_event_id {
> --
> 2.34.1
>


signature.asc
Description: PGP signature


Re: [PATCH RFC 1/2] dt-bindings: soc: qcom,smsm: Allow specifying mboxes instead of qcom,ipc

2024-05-23 Thread Krzysztof Kozlowski
On 23/05/2024 08:16, Luca Weiss wrote:
> On Donnerstag, 23. Mai 2024 08:02:13 MESZ Krzysztof Kozlowski wrote:
>> On 22/05/2024 19:34, Luca Weiss wrote:
>>> On Mittwoch, 22. Mai 2024 08:49:43 MESZ Krzysztof Kozlowski wrote:
 On 21/05/2024 22:35, Luca Weiss wrote:
> On Dienstag, 21. Mai 2024 10:58:07 MESZ Krzysztof Kozlowski wrote:
>> On 20/05/2024 17:11, Luca Weiss wrote:
>>> Hi Krzysztof
>>>
>>> Ack, sounds good.
>>>
>>> Maybe also from you, any opinion between these two binding styles?
>>>
>>> So first using index of mboxes for the numbering, where for the known
>>> usages the first element (and sometimes the 3rd - ipc-2) are empty <>.
>>>
>>> The second variant is using mbox-names to get the correct channel-mbox
>>> mapping.
>>>
>>> -   qcom,ipc-1 = < 8 13>;
>>> -   qcom,ipc-2 = < 8 9>;
>>> -   qcom,ipc-3 = < 8 19>;
>>> +   mboxes = <0>, < 13>, < 9>, < 19>;
>>>
>>> vs.
>>>
>>> -   qcom,ipc-1 = < 8 13>;
>>> -   qcom,ipc-2 = < 8 9>;
>>> -   qcom,ipc-3 = < 8 19>;
>>> +   mboxes = < 13>, < 9>, < 19>;
>>> +   mbox-names = "ipc-1", "ipc-2", "ipc-3";
>>
>> Sorry, don't get, ipc-1 is the first mailbox, so why would there be <0>
>> in first case?
>
> Actually not, ipc-0 would be permissible by the driver, used for the 0th 
> host
>
> e.g. from:
>
>   /* Iterate over all hosts to check whom wants a kick */
>   for (host = 0; host < smsm->num_hosts; host++) {
>   hostp = &smsm->hosts[host];
>
> Even though no mailbox is specified in any upstream dts for this 0th host 
> I
> didn't want the bindings to restrict that, that's why in the first example
> there's an empty element (<0>) for the 0th smsm host
>
>> Anyway, the question is if you need to know that some
>> mailbox is missing. But then it is weird to name them "ipc-1" etc.
>
> In either case we'd just query the mbox (either by name or index) and then
> see if it's there? Not quite sure I understand the sentence..
> Pretty sure either binding would work the same way.

 The question is: does the driver care only about having some mailboxes
 or the driver cares about each specific mailbox? IOW, is skipping ipc-0
 important for the driver?
>>>
>>> There's nothing special from driver side about any mailbox. Some SoCs have
>>> a mailbox for e.g. hosts 1&2&3, some have only 1&3, and apq8064 even has
>>> 1&2&3&4.
>>>
>>> And if the driver doesn't find a mailbox for a host, it just ignores it
>>> but then of course it can't 'ring' the mailbox for that host when necessary.
>>>
>>> Not sure how much more I can add here, to be fair I barely understand what
>>> this driver is doing myself apart from the obvious.
>>
>> From what you said, it looks like it is enough to just list mailboxes,
>> e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):
> 
> No, for sure we need also the possibility to list ipc-3.

? You can list it, what's the problem>

> 
> And my point is that I'm not sure if any platform will ever need ipc-0, but
> the code to use that if it ever exists is there - the driver always
> tries getting an mbox (currently just syscon of course) for every host
> from 0 to n.
> 
> These are the current (non-mbox-API) mboxes provided to smsm:
> 
> $ git grep qcom,ipc- arch/
> arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-1 = < 8 
> 4>;
> arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-2 = < 8 
> 14>;
> arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-3 = < 8 
> 23>;
> arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-4 = 
> <_sic_non_secure 0x4094 0>;
> arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-1 = < 8 
> 13>;
> arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-2 = < 8 
> 9>;
> arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-3 = < 8 
> 19>;
> arch/arm64/boot/dts/qcom/msm8916.dtsi:  qcom,ipc-1 = < 8 13>;
> arch/arm64/boot/dts/qcom/msm8916.dtsi:  qcom,ipc-3 = < 8 19>;
> arch/arm64/boot/dts/qcom/msm8939.dtsi:  qcom,ipc-1 = <_mbox 8 
> 13>;
> arch/arm64/boot/dts/qcom/msm8939.dtsi:  qcom,ipc-3 = <_mbox 8 
> 19>;
> arch/arm64/boot/dts/qcom/msm8953.dtsi:  qcom,ipc-1 = < 8 13>;
> arch/arm64/boot/dts/qcom/msm8953.dtsi:  qcom,ipc-3 = < 8 19>;
> arch/arm64/boot/dts/qcom/msm8976.dtsi:  qcom,ipc-1 = < 8 13>;
> arch/arm64/boot/dts/qcom/msm8976.dtsi:  qcom,ipc-2 = < 8 9>;
> arch/arm64/boot/dts/qcom/msm8976.dtsi:  qcom,ipc-3 = < 8 19>;
> 
>>
>> mboxes = < 13>, < 9>, < 19>;

So which case is not covered?

Best regards,
Krzysztof




Re: [PATCH RFC 1/2] dt-bindings: soc: qcom,smsm: Allow specifying mboxes instead of qcom,ipc

2024-05-23 Thread Luca Weiss
On Donnerstag, 23. Mai 2024 08:02:13 MESZ Krzysztof Kozlowski wrote:
> On 22/05/2024 19:34, Luca Weiss wrote:
> > On Mittwoch, 22. Mai 2024 08:49:43 MESZ Krzysztof Kozlowski wrote:
> >> On 21/05/2024 22:35, Luca Weiss wrote:
> >>> On Dienstag, 21. Mai 2024 10:58:07 MESZ Krzysztof Kozlowski wrote:
>  On 20/05/2024 17:11, Luca Weiss wrote:
> > Hi Krzysztof
> >
> > Ack, sounds good.
> >
> > Maybe also from you, any opinion between these two binding styles?
> >
> > So first using index of mboxes for the numbering, where for the known
> > usages the first element (and sometimes the 3rd - ipc-2) are empty <>.
> >
> > The second variant is using mbox-names to get the correct channel-mbox
> > mapping.
> >
> > -   qcom,ipc-1 = < 8 13>;
> > -   qcom,ipc-2 = < 8 9>;
> > -   qcom,ipc-3 = < 8 19>;
> > +   mboxes = <0>, < 13>, < 9>, < 19>;
> >
> > vs.
> >
> > -   qcom,ipc-1 = < 8 13>;
> > -   qcom,ipc-2 = < 8 9>;
> > -   qcom,ipc-3 = < 8 19>;
> > +   mboxes = < 13>, < 9>, < 19>;
> > +   mbox-names = "ipc-1", "ipc-2", "ipc-3";
> 
>  Sorry, don't get, ipc-1 is the first mailbox, so why would there be <0>
>  in first case?
> >>>
> >>> Actually not, ipc-0 would be permissible by the driver, used for the 0th 
> >>> host
> >>>
> >>> e.g. from:
> >>>
> >>>   /* Iterate over all hosts to check whom wants a kick */
> >>>   for (host = 0; host < smsm->num_hosts; host++) {
> >>>   hostp = &smsm->hosts[host];
> >>>
> >>> Even though no mailbox is specified in any upstream dts for this 0th host 
> >>> I
> >>> didn't want the bindings to restrict that, that's why in the first example
> >>> there's an empty element (<0>) for the 0th smsm host
> >>>
>  Anyway, the question is if you need to know that some
>  mailbox is missing. But then it is weird to name them "ipc-1" etc.
> >>>
> >>> In either case we'd just query the mbox (either by name or index) and then
> >>> see if it's there? Not quite sure I understand the sentence..
> >>> Pretty sure either binding would work the same way.
> >>
> >> The question is: does the driver care only about having some mailboxes
> >> or the driver cares about each specific mailbox? IOW, is skipping ipc-0
> >> important for the driver?
> > 
> > There's nothing special from driver side about any mailbox. Some SoCs have
> > a mailbox for e.g. hosts 1&2&3, some have only 1&3, and apq8064 even has
> > 1&2&3&4.
> > 
> > And if the driver doesn't find a mailbox for a host, it just ignores it
> > but then of course it can't 'ring' the mailbox for that host when necessary.
> > 
> > Not sure how much more I can add here, to be fair I barely understand what
> > this driver is doing myself apart from the obvious.
> 
> From what you said, it looks like it is enough to just list mailboxes,
> e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):

No, for sure we need also the possibility to list ipc-3.

And my point is that I'm not sure if any platform will ever need ipc-0, but
the code to use that if it ever exists is there - the driver always
tries getting an mbox (currently just syscon of course) for every host
from 0 to n.
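
For illustration only (not the actual driver change), with an "mboxes"
property that per-host lookup could be a simple indexed request that
tolerates missing entries, assuming the generic mailbox client API (the
helper name and the caller-provided chans array are illustrative):

```c
#include <linux/err.h>
#include <linux/mailbox_client.h>

static void smsm_get_host_mboxes(struct mbox_client *cl,
				 struct mbox_chan **chans, int num_hosts)
{
	int host;

	/* Same 0..n iteration as today; hosts without a mailbox are skipped. */
	for (host = 0; host < num_hosts; host++) {
		chans[host] = mbox_request_channel(cl, host);
		if (IS_ERR(chans[host]))
			chans[host] = NULL;	/* no mailbox for this host */
	}
}
```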

These are the current (non-mbox-API) mboxes provided to smsm:

$ git grep qcom,ipc- arch/
arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-1 = < 8 
4>;
arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-2 = < 8 
14>;
arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-3 = < 8 
23>;
arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-4 = 
<_sic_non_secure 0x4094 0>;
arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-1 = < 8 
13>;
arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-2 = < 8 
9>;
arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-3 = < 8 
19>;
arch/arm64/boot/dts/qcom/msm8916.dtsi:  qcom,ipc-1 = < 8 13>;
arch/arm64/boot/dts/qcom/msm8916.dtsi:  qcom,ipc-3 = < 8 19>;
arch/arm64/boot/dts/qcom/msm8939.dtsi:  qcom,ipc-1 = <_mbox 8 13>;
arch/arm64/boot/dts/qcom/msm8939.dtsi:  qcom,ipc-3 = <_mbox 8 19>;
arch/arm64/boot/dts/qcom/msm8953.dtsi:  qcom,ipc-1 = < 8 13>;
arch/arm64/boot/dts/qcom/msm8953.dtsi:  qcom,ipc-3 = < 8 19>;
arch/arm64/boot/dts/qcom/msm8976.dtsi:  qcom,ipc-1 = < 8 13>;
arch/arm64/boot/dts/qcom/msm8976.dtsi:  qcom,ipc-2 = < 8 9>;
arch/arm64/boot/dts/qcom/msm8976.dtsi:  qcom,ipc-3 = < 8 19>;

> 
> mboxes = < 13>, < 9>, < 19>;
> 
> Best regards,
> Krzysztof
> 
> 







Re: [PATCH RFC 1/2] dt-bindings: soc: qcom,smsm: Allow specifying mboxes instead of qcom,ipc

2024-05-23 Thread Krzysztof Kozlowski
On 22/05/2024 19:34, Luca Weiss wrote:
> On Mittwoch, 22. Mai 2024 08:49:43 MESZ Krzysztof Kozlowski wrote:
>> On 21/05/2024 22:35, Luca Weiss wrote:
>>> On Dienstag, 21. Mai 2024 10:58:07 MESZ Krzysztof Kozlowski wrote:
 On 20/05/2024 17:11, Luca Weiss wrote:
> Hi Krzysztof
>
> Ack, sounds good.
>
> Maybe also from you, any opinion between these two binding styles?
>
> So first using index of mboxes for the numbering, where for the known
> usages the first element (and sometimes the 3rd - ipc-2) are empty <>.
>
> The second variant is using mbox-names to get the correct channel-mbox
> mapping.
>
> -   qcom,ipc-1 = < 8 13>;
> -   qcom,ipc-2 = < 8 9>;
> -   qcom,ipc-3 = < 8 19>;
> +   mboxes = <0>, < 13>, < 9>, < 19>;
>
> vs.
>
> -   qcom,ipc-1 = < 8 13>;
> -   qcom,ipc-2 = < 8 9>;
> -   qcom,ipc-3 = < 8 19>;
> +   mboxes = < 13>, < 9>, < 19>;
> +   mbox-names = "ipc-1", "ipc-2", "ipc-3";

 Sorry, don't get, ipc-1 is the first mailbox, so why would there be <0>
 in first case?
>>>
>>> Actually not, ipc-0 would be permissible by the driver, used for the 0th 
>>> host
>>>
>>> e.g. from:
>>>
>>> /* Iterate over all hosts to check whom wants a kick */
>>> for (host = 0; host < smsm->num_hosts; host++) {
>>> hostp = &smsm->hosts[host];
>>>
>>> Even though no mailbox is specified in any upstream dts for this 0th host I
>>> didn't want the bindings to restrict that, that's why in the first example
>>> there's an empty element (<0>) for the 0th smsm host
>>>
 Anyway, the question is if you need to know that some
 mailbox is missing. But then it is weird to name them "ipc-1" etc.
>>>
>>> In either case we'd just query the mbox (either by name or index) and then
>>> see if it's there? Not quite sure I understand the sentence..
>>> Pretty sure either binding would work the same way.
>>
>> The question is: does the driver care only about having some mailboxes
>> or the driver cares about each specific mailbox? IOW, is skipping ipc-0
>> important for the driver?
> 
> There's nothing special from driver side about any mailbox. Some SoCs have
> a mailbox for e.g. hosts 1&2&3, some have only 1&3, and apq8064 even has
> 1&2&3&4.
> 
> And if the driver doesn't find a mailbox for a host, it just ignores it
> but then of course it can't 'ring' the mailbox for that host when necessary.
> 
> Not sure how much more I can add here, to be fair I barely understand what
> this driver is doing myself apart from the obvious.

From what you said, it looks like it is enough to just list mailboxes,
e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):

mboxes = < 13>, < 9>, < 19>;

Best regards,
Krzysztof




[GIT PULL v2] virtio: features, fixes, cleanups

2024-05-23 Thread Michael S. Tsirkin


Things to note here:
- dropped a couple of patches at the last moment. Did a bunch
  of testing in the last day to make sure that's not causing
  any fallout, it's a revert and no other changes in the same area
  so I feel rather safe doing that.
- the new Marvell OCTEON DPU driver is not here: latest v4 keeps causing
  build failures on mips. I kept deferring the pull hoping to get it in
  and I might try to merge a new version post rc1 (supposed to be ok for
  new drivers as they can't cause regressions), but we'll see.
- there are also a couple bugfixes under review, to be merged after rc1
- there is a trivial conflict in the header file. Shouldn't be any
  trouble to resolve, but fyi the resolution by Stephen is here
diff --cc drivers/virtio/virtio_mem.c
index e8355f55a8f7,6d4dfbc53a66..
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@@ -21,7 -21,7 +21,8 @@@
  #include 
  #include 
  #include 
 +#include 
+ #include 
  Also see it here:
  https://lore.kernel.org/all/20240423145947.14217...@canb.auug.org.au/


The following changes since commit 18daea77cca626f590fb140fc11e3a43c5d41354:

  Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm 
(2024-04-30 12:40:41 -0700)

are available in the Git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus

for you to fetch changes up to c8fae27d141a32a1624d0d0d5419d94252824498:

  virtio-pci: Check if is_avq is NULL (2024-05-22 08:39:41 -0400)


virtio: features, fixes, cleanups

Several new features here:

- virtio-net is finally supported in vduse.

- Virtio (balloon and mem) interaction with suspend is improved

- vhost-scsi now handles signals better/faster.

Fixes, cleanups all over the place.

Signed-off-by: Michael S. Tsirkin 


Christophe JAILLET (1):
  vhost-vdpa: Remove usage of the deprecated ida_simple_xx() API

David Hildenbrand (1):
  virtio-mem: support suspend+resume

David Stevens (2):
  virtio_balloon: Give the balloon its own wakeup source
  virtio_balloon: Treat stats requests as wakeup events

Eugenio Pérez (1):
  MAINTAINERS: add Eugenio Pérez as reviewer

Jiri Pirko (1):
  virtio: delete vq in vp_find_vqs_msix() when request_irq() fails

Krzysztof Kozlowski (24):
  virtio: balloon: drop owner assignment
  virtio: input: drop owner assignment
  virtio: mem: drop owner assignment
  um: virt-pci: drop owner assignment
  virtio_blk: drop owner assignment
  bluetooth: virtio: drop owner assignment
  hwrng: virtio: drop owner assignment
  virtio_console: drop owner assignment
  crypto: virtio - drop owner assignment
  firmware: arm_scmi: virtio: drop owner assignment
  gpio: virtio: drop owner assignment
  drm/virtio: drop owner assignment
  iommu: virtio: drop owner assignment
  misc: nsm: drop owner assignment
  net: caif: virtio: drop owner assignment
  net: virtio: drop owner assignment
  net: 9p: virtio: drop owner assignment
  vsock/virtio: drop owner assignment
  wifi: mac80211_hwsim: drop owner assignment
  nvdimm: virtio_pmem: drop owner assignment
  rpmsg: virtio: drop owner assignment
  scsi: virtio: drop owner assignment
  fuse: virtio: drop owner assignment
  sound: virtio: drop owner assignment

Li Zhang (1):
  virtio-pci: Check if is_avq is NULL

Li Zhijian (1):
  vdpa: Convert sprintf/snprintf to sysfs_emit

Maxime Coquelin (3):
  vduse: validate block features only with block devices
  vduse: Temporarily fail if control queue feature requested
  vduse: enable Virtio-net device type

Michael S. Tsirkin (1):
  Merge tag 'stable/vduse-virtio-net' into vhost

Mike Christie (9):
  vhost-scsi: Handle vhost_vq_work_queue failures for events
  vhost-scsi: Handle vhost_vq_work_queue failures for cmds
  vhost-scsi: Use system wq to flush dev for TMFs
  vhost: Remove vhost_vq_flush
  vhost_scsi: Handle vhost_vq_work_queue failures for TMFs
  vhost: Use virtqueue mutex for swapping worker
  vhost: Release worker mutex during flushes
  vhost_task: Handle SIGKILL by flushing work and exiting
  kernel: Remove signal hacks for vhost_tasks

Uwe Kleine-König (1):
  virtio-mmio: Convert to platform remove callback returning void

Yuxue Liu (2):
  vp_vdpa: Fix return value check vp_vdpa_request_irq
  vp_vdpa: don't allocate unused msix vectors

Zhu Lingshan (1):
  MAINTAINERS: apply maintainer role of Intel vDPA driver

 MAINTAINERS   |  10 +-
 arch/um/drivers/virt-pci.c|   1 -
 drivers/block/virtio_blk.c|   1 -
 drivers/bluetooth/virtio_bt.c |   1 -
 

[PATCH v2 3/4] eventfs: Update all the eventfs_inodes from the events descriptor

2024-05-22 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

The change to update the permissions of the eventfs_inode had the
misconception that using the tracefs_inode would find all the
eventfs_inodes that have been updated and reset them on remount.
The problem with this approach is that the eventfs_inodes are freed when
they are no longer used (basically the reason the eventfs system exists).
When they are freed, the updated eventfs_inodes are not reset on a remount
because their tracefs_inodes have been freed.

Instead, since the events directory eventfs_inode always has a
tracefs_inode pointing to it (it is not freed when finished), and the
events directory has a link to all its children, have the
eventfs_remount() function only operate on the events eventfs_inode and
have it descend into its children updating their uid and gids.

Link: 
https://lore.kernel.org/all/cak7lnarxgaww3kh9jgrnh4vk6fr8ldknkf3wq8nhmwjrvwj...@mail.gmail.com/

Cc: sta...@vger.kernel.org
Fixes: baa23a8d4360d ("tracefs: Reset permissions on remount if permissions are 
options")
Reported-by: Masahiro Yamada 
Signed-off-by: Steven Rostedt (Google) 
---
 fs/tracefs/event_inode.c | 44 
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 5dfb1ccd56ea..129d0f54ba62 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -305,27 +305,27 @@ static const struct file_operations 
eventfs_file_operations = {
.llseek = generic_file_llseek,
 };
 
-/*
- * On a remount of tracefs, if UID or GID options are set, then
- * the mount point inode permissions should be used.
- * Reset the saved permission flags appropriately.
- */
-void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool 
update_gid)
+static void eventfs_set_attrs(struct eventfs_inode *ei, bool update_uid, 
kuid_t uid,
+ bool update_gid, kgid_t gid, int level)
 {
-   struct eventfs_inode *ei = ti->private;
+   struct eventfs_inode *ei_child;
 
-   if (!ei)
+   /* Update events/<system>/<event> */
+   if (WARN_ON_ONCE(level > 3))
return;
 
if (update_uid) {
ei->attr.mode &= ~EVENTFS_SAVE_UID;
-   ei->attr.uid = ti->vfs_inode.i_uid;
+   ei->attr.uid = uid;
}
 
-
if (update_gid) {
ei->attr.mode &= ~EVENTFS_SAVE_GID;
-   ei->attr.gid = ti->vfs_inode.i_gid;
+   ei->attr.gid = gid;
+   }
+
+   list_for_each_entry(ei_child, &ei->children, list) {
+   eventfs_set_attrs(ei_child, update_uid, uid, update_gid, gid, 
level + 1);
}
 
if (!ei->entry_attrs)
@@ -334,13 +334,31 @@ void eventfs_remount(struct tracefs_inode *ti, bool 
update_uid, bool update_gid)
for (int i = 0; i < ei->nr_entries; i++) {
if (update_uid) {
ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_UID;
-   ei->entry_attrs[i].uid = ti->vfs_inode.i_uid;
+   ei->entry_attrs[i].uid = uid;
}
if (update_gid) {
ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_GID;
-   ei->entry_attrs[i].gid = ti->vfs_inode.i_gid;
+   ei->entry_attrs[i].gid = gid;
}
}
+
+}
+
+/*
+ * On a remount of tracefs, if UID or GID options are set, then
+ * the mount point inode permissions should be used.
+ * Reset the saved permission flags appropriately.
+ */
+void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool 
update_gid)
+{
+   struct eventfs_inode *ei = ti->private;
+
+   /* Only the events directory does the updates */
+   if (!ei || !ei->is_events || ei->is_freed)
+   return;
+
+   eventfs_set_attrs(ei, update_uid, ti->vfs_inode.i_uid,
+ update_gid, ti->vfs_inode.i_gid, 0);
 }
 
 /* Return the evenfs_inode of the "events" directory */
-- 
2.43.0




