[PATCH v2 5/7] tracing: add new type "%pd/%pD" in readme_msg[]

2024-01-21 Thread Ye Bin
Signed-off-by: Ye Bin 
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2a7c6fd934e9..13197d3b86bd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5745,7 +5745,7 @@ static const char readme_msg[] =
"\t   +|-[u](), \\imm-value, 
\\\"imm-string\"\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, 
symbol,\n"
"\t   b@/, ustring,\n"
-   "\t   symstr, \\[\\]\n"
+   "\t   symstr, %pd/%pD \\[\\]\n"
 #ifdef CONFIG_HIST_TRIGGERS
"\tfield:  ;\n"
"\tstype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
-- 
2.31.1




[PATCH v2 7/7] selftests/ftrace: add test cases for VFS type "%pd" and "%pD"

2024-01-21 Thread Ye Bin
This patch adds test cases for new print format type "%pd/%pD".The test cases
test the following items:
1. Test README if add "%pd/%pD" type;
2. Test "%pd" type for dput();
3. Test "%pD" type for vfs_read();

Signed-off-by: Ye Bin 
---
 .../ftrace/test.d/kprobe/kprobe_args_vfs.tc   | 79 +++
 1 file changed, 79 insertions(+)
 create mode 100644 
tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc

diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc 
b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc
new file mode 100644
index ..1d8edd294dd6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc
@@ -0,0 +1,79 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event VFS type argument
+# requires: kprobe_events
+
+case `uname -m` in
+x86_64)
+  ARG1=%di
+;;
+i[3456]86)
+  ARG1=%ax
+;;
+aarch64)
+  ARG1=%x0
+;;
+arm*)
+  ARG1=%r0
+;;
+ppc64*)
+  ARG1=%r3
+;;
+ppc*)
+  ARG1=%r3
+;;
+s390*)
+  ARG1=%r2
+;;
+mips*)
+  ARG1=%r4
+;;
+loongarch*)
+  ARG1=%r4
+;;
+riscv*)
+  ARG1=%a0
+;;
+*)
+  echo "Please implement other architecture here"
+  exit_untested
+esac
+
+: "Test argument %pd/%pD in README"
+grep -q "%pd/%pD" README
+
+: "Test argument %pd with name"
+echo "p:testprobe dput name=${ARG1}:%pd" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+grep -q "1" events/kprobes/testprobe/enable
+echo 0 > events/kprobes/testprobe/enable
+grep "dput" trace | grep -q "enable"
+echo "" > kprobe_events
+echo "" > trace
+
+: "Test argument %pd without name"
+echo "p:testprobe dput ${ARG1}:%pd" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+grep -q "1" events/kprobes/testprobe/enable
+echo 0 > events/kprobes/testprobe/enable
+grep "dput" trace | grep -q "enable"
+echo "" > kprobe_events
+echo "" > trace
+
+: "Test argument %pD with name"
+echo "p:testprobe vfs_read name=${ARG1}:%pD" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+grep -q "1" events/kprobes/testprobe/enable
+echo 0 > events/kprobes/testprobe/enable
+grep "vfs_read" trace | grep -q "enable"
+echo "" > kprobe_events
+echo "" > trace
+
+: "Test argument %pD without name"
+echo "p:testprobe vfs_read ${ARG1}:%pD" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+grep -q "1"  events/kprobes/testprobe/enable
+echo 0 > events/kprobes/testprobe/enable
+grep "vfs_read" trace | grep -q "enable"
+echo "" > kprobe_events
+echo "" > trace
-- 
2.31.1




[PATCH v2 4/7] tracing/probes: support '%pD' type for print struct file's name

2024-01-21 Thread Ye Bin
Similar to '%pD' for printk, use '%pD' for print struct file's name.

Signed-off-by: Ye Bin 
---
 kernel/trace/trace_probe.c | 41 --
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1599c0c3e6b7..f9819625de58 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -12,6 +12,7 @@
 #define pr_fmt(fmt)"trace_probe: " fmt
 
 #include 
+#include 
 #include "trace_btf.h"
 
 #include "trace_probe.h"
@@ -1572,28 +1573,38 @@ int traceprobe_expand_dentry_args(int argc, const char 
*argv[], char *buf,
 
used = 0;
for (i = 0; i < argc; i++) {
-   if (str_has_suffix(argv[i], ":%pd")) {
-   char *tmp = kstrdup(argv[i], GFP_KERNEL);
-   char *equal;
+   if (!str_has_suffix(argv[i], ":%pd") &&
+   !str_has_suffix(argv[i], ":%pD"))
+   continue;
 
-   if (!tmp)
-   return -ENOMEM;
+   char *tmp = kstrdup(argv[i], GFP_KERNEL);
+   char *equal;
+
+   if (!tmp)
+   return -ENOMEM;
 
-   equal = strchr(tmp, '=');
-   if (equal)
-   *equal = '\0';
-   tmp[strlen(argv[i]) - 4] = '\0';
+   equal = strchr(tmp, '=');
+   if (equal)
+   *equal = '\0';
+   tmp[strlen(argv[i]) - 4] = '\0';
+   if (argv[i][strlen(argv[i]) - 1] == 'd')
ret = snprintf(buf + used, bufsize - used,
   "%s%s+0x0(+0x%zx(%s)):string",
   equal ? tmp : "", equal ? "=" : "",
   offsetof(struct dentry, d_name.name),
   equal ? equal + 1 : tmp);
-   kfree(tmp);
-   if (ret >= bufsize - used)
-   return -ENOMEM;
-   argv[i] = buf + used;
-   used += ret + 1;
-   }
+   else
+   ret = snprintf(buf + used, bufsize - used,
+  "%s%s+0x0(+0x%zx(+0x%zx(%s))):string",
+  equal ? tmp : "", equal ? "=" : "",
+  offsetof(struct dentry, d_name.name),
+  offsetof(struct file, f_path.dentry),
+  equal ? equal + 1 : tmp);
+   kfree(tmp);
+   if (ret >= bufsize - used)
+   return -ENOMEM;
+   argv[i] = buf + used;
+   used += ret + 1;
}
 
return 0;
-- 
2.31.1




[PATCH v2 6/7] Documentation: tracing: add new type '%pd' and '%pD' for kprobe

2024-01-21 Thread Ye Bin
Similar to printk() '%pd' is for fetch dentry's name from struct dentry's
pointer, and '%pD' is for fetch file's name from struct file's pointer.

Signed-off-by: Ye Bin 
---
 Documentation/trace/kprobetrace.rst | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/trace/kprobetrace.rst 
b/Documentation/trace/kprobetrace.rst
index bf9cecb69fc9..a1d12d65a8dc 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -58,7 +58,8 @@ Synopsis of kprobe_events
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
  (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
- (x8/x16/x32/x64), "char", "string", "ustring", "symbol", 
"symstr"
+ (x8/x16/x32/x64), VFS layer common type(%pd/%pD) for print
+  file name, "char", "string", "ustring", "symbol", "symstr"
   and bitfield are supported.
 
   (\*1) only for the probe on function entry (offs == 0). Note, this argument 
access
@@ -113,6 +114,9 @@ With 'symstr' type, you can filter the event with wildcard 
pattern of the
 symbols, and you don't need to solve symbol name by yourself.
 For $comm, the default type is "string"; any other type is invalid.
 
+VFS layer common type(%pd/%pD) is a special type, which fetches dentry's or
+file's name from struct dentry's pointer or struct file's pointer.
+
 .. _user_mem_access:
 
 User Memory Access
-- 
2.31.1




[PATCH v2 3/7] tracing/probes: support '%pd' type for print struct dentry's name

2024-01-21 Thread Ye Bin
Similar to '%pd' for printk, use '%pd' for print struct dentry's name.

Signed-off-by: Ye Bin 
---
 kernel/trace/trace_kprobe.c | 6 ++
 kernel/trace/trace_probe.h  | 1 +
 2 files changed, 7 insertions(+)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c4c6e0e0068b..00b74530fbad 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -779,6 +779,7 @@ static int __trace_kprobe_create(int argc, const char 
*argv[])
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
+   char dbuf[MAX_DENTRY_ARGS_LEN];
struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
 
switch (argv[0][0]) {
@@ -930,6 +931,11 @@ static int __trace_kprobe_create(int argc, const char 
*argv[])
argv = new_argv;
}
 
+   ret = traceprobe_expand_dentry_args(argc, argv, dbuf,
+   MAX_DENTRY_ARGS_LEN);
+   if (ret)
+   goto out;
+
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
argc, is_return);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 553371a4e0b1..d9c053824975 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -34,6 +34,7 @@
 #define MAX_ARRAY_LEN  64
 #define MAX_ARG_NAME_LEN   32
 #define MAX_BTF_ARGS_LEN   128
+#define MAX_DENTRY_ARGS_LEN256
 #define MAX_STRING_SIZEPATH_MAX
 #define MAX_ARG_BUF_LEN(MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
 
-- 
2.31.1




[PATCH v2 2/7] tracing/probes: add traceprobe_expand_dentry_args() helper

2024-01-21 Thread Ye Bin
Add traceprobe_expand_dentry_args() to expand dentry args. this API is
prepare to support "%pd" print format for kprobe.

Signed-off-by: Ye Bin 
---
 kernel/trace/trace_probe.c | 34 ++
 kernel/trace/trace_probe.h |  2 ++
 2 files changed, 36 insertions(+)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 4dc74d73fc1d..1599c0c3e6b7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1565,6 +1565,40 @@ const char **traceprobe_expand_meta_args(int argc, const 
char *argv[],
return ERR_PTR(ret);
 }
 
+int traceprobe_expand_dentry_args(int argc, const char *argv[], char *buf,
+ int bufsize)
+{
+   int i, used, ret;
+
+   used = 0;
+   for (i = 0; i < argc; i++) {
+   if (str_has_suffix(argv[i], ":%pd")) {
+   char *tmp = kstrdup(argv[i], GFP_KERNEL);
+   char *equal;
+
+   if (!tmp)
+   return -ENOMEM;
+
+   equal = strchr(tmp, '=');
+   if (equal)
+   *equal = '\0';
+   tmp[strlen(argv[i]) - 4] = '\0';
+   ret = snprintf(buf + used, bufsize - used,
+  "%s%s+0x0(+0x%zx(%s)):string",
+  equal ? tmp : "", equal ? "=" : "",
+  offsetof(struct dentry, d_name.name),
+  equal ? equal + 1 : tmp);
+   kfree(tmp);
+   if (ret >= bufsize - used)
+   return -ENOMEM;
+   argv[i] = buf + used;
+   used += ret + 1;
+   }
+   }
+
+   return 0;
+}
+
 void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
 {
clear_btf_context(ctx);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 850d9ecb6765..553371a4e0b1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -402,6 +402,8 @@ extern int traceprobe_parse_probe_arg(struct trace_probe 
*tp, int i,
 const char **traceprobe_expand_meta_args(int argc, const char *argv[],
 int *new_argc, char *buf, int bufsize,
 struct traceprobe_parse_context *ctx);
+extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char 
*buf,
+int bufsize);
 
 extern int traceprobe_update_arg(struct probe_arg *arg);
 extern void traceprobe_free_probe_arg(struct probe_arg *arg);
-- 
2.31.1




[PATCH v2 0/7] support '%pd' and '%pD' for print file name

2024-01-21 Thread Ye Bin
During fault locating, the file name needs to be printed based on the
dentry/file address. The offset needs to be calculated each time, which
is troublesome. Similar to printk, kprobe supports printing file names
for dentry/file addresses.

Diff v2 vs v1:
1. Use "%pd/%pD" print format instead of "pd/pD" print format;
2. Add "%pd/%pD" in README;
3. Expand "%pd/%pD" argument before parameter parsing;
4. Add more detail information in ftrace documentation;
5. Add test cases for new print format in selftests/ftrace;

Ye Bin (7):
  string.h: add str_has_suffix() helper for test string ends with
specify string
  tracing/probes: add traceprobe_expand_dentry_args() helper
  tracing/probes: support '%pd' type for print struct dentry's name
  tracing/probes: support '%pD' type for print struct file's name
  tracing: add new type "%pd/%pD" in readme_msg[]
  Documentation: tracing: add new type '%pd' and '%pD' for kprobe
  selftests/ftrace: add test cases for VFS type "%pd" and "%pD"

 Documentation/trace/kprobetrace.rst   |  6 +-
 include/linux/string.h| 20 +
 kernel/trace/trace.c  |  2 +-
 kernel/trace/trace_kprobe.c   |  6 ++
 kernel/trace/trace_probe.c| 45 +++
 kernel/trace/trace_probe.h|  3 +
 .../ftrace/test.d/kprobe/kprobe_args_vfs.tc   | 79 +++
 7 files changed, 159 insertions(+), 2 deletions(-)
 create mode 100644 
tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc

-- 
2.31.1




[PATCH v2 1/7] string.h: add str_has_suffix() helper for test string ends with specify string

2024-01-21 Thread Ye Bin
str_has_suffix() is test string if ends with specify string.

Signed-off-by: Ye Bin 
---
 include/linux/string.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/include/linux/string.h b/include/linux/string.h
index 433c207a01da..e47e9597af27 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -405,4 +405,24 @@ static __always_inline size_t str_has_prefix(const char 
*str, const char *prefix
return strncmp(str, prefix, len) == 0 ? len : 0;
 }
 
+/**
+ * str_has_suffix - Test if a string has a given suffix
+ * @str: The string to test
+ * @suffix: The string to see if @str ends with
+ *
+ * Returns:
+ * * strlen(@suffix) if @str ends with @suffix
+ * * 0 if @str does not end with @suffix
+ */
+static __always_inline size_t str_has_suffix(const char *str, const char 
*suffix)
+{
+   size_t len = strlen(suffix);
+   size_t str_len = strlen(str);
+
+   if (len > str_len)
+   return 0;
+
+   return strncmp(str + str_len - len, suffix, len) == 0 ? len : 0;
+}
+
 #endif /* _LINUX_STRING_H_ */
-- 
2.31.1




[PATCH] percpu: improve percpu_alloc_percpu_fail event trace

2024-01-21 Thread George Guo
From: George Guo 

Add do_warn, warn_limit fields to the output of the
percpu_alloc_percpu_fail ftrace event.

This is required to percpu_alloc failed with no warning showing.

Signed-off-by: George Guo 
---
 include/trace/events/percpu.h | 22 ++
 mm/percpu.c   |  2 +-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
index 5b8211ca8950..c5f412e84bb8 100644
--- a/include/trace/events/percpu.h
+++ b/include/trace/events/percpu.h
@@ -75,15 +75,18 @@ TRACE_EVENT(percpu_free_percpu,
 
 TRACE_EVENT(percpu_alloc_percpu_fail,
 
-   TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),
+   TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align,
+bool do_warn, int warn_limit),
 
-   TP_ARGS(reserved, is_atomic, size, align),
+   TP_ARGS(reserved, is_atomic, size, align, do_warn, warn_limit),
 
TP_STRUCT__entry(
-   __field(bool,   reserved)
-   __field(bool,   is_atomic   )
-   __field(size_t, size)
-   __field(size_t, align   )
+   __field(bool,   reserved)
+   __field(bool,   is_atomic)
+   __field(size_t, size)
+   __field(size_t, align)
+   __field(bool,   do_warn)
+   __field(int,warn_limit)
),
 
TP_fast_assign(
@@ -91,11 +94,14 @@ TRACE_EVENT(percpu_alloc_percpu_fail,
__entry->is_atomic  = is_atomic;
__entry->size   = size;
__entry->align  = align;
+   __entry->do_warn= do_warn;
+   __entry->warn_limit = warn_limit;
),
 
-   TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
+   TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu do_warn=%d, 
warn_limit=%d",
  __entry->reserved, __entry->is_atomic,
- __entry->size, __entry->align)
+ __entry->size, __entry->align,
+ __entry->do_warn, __entry->warn_limit)
 );
 
 TRACE_EVENT(percpu_create_chunk,
diff --git a/mm/percpu.c b/mm/percpu.c
index 4e11fc1e6def..ac5b48268c99 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1886,7 +1886,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t 
align, bool reserved,
 fail_unlock:
spin_unlock_irqrestore(_lock, flags);
 fail:
-   trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
+   trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align, 
do_warn, warn_limit);
 
if (do_warn && warn_limit) {
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
-- 
2.34.1




Re: [PATCH net] ipvs: Simplify the allocation of ip_vs_conn slab caches

2024-01-21 Thread Kunwu Chan

On 2024/1/19 23:20, Simon Horman wrote:

On Thu, Jan 18, 2024 at 10:22:05AM +0800, Kunwu Chan wrote:

Hi Simon,

Thanks for your reply.

On 2024/1/17 17:29, Simon Horman wrote:

On Wed, Jan 17, 2024 at 03:20:45PM +0800, Kunwu Chan wrote:

Use the new KMEM_CACHE() macro instead of direct kmem_cache_create
to simplify the creation of SLAB caches.

Signed-off-by: Kunwu Chan 


Hi Kunwu Chan,

I think this is more of a cleanup than a fix,
so it should probably be targeted at 'nf-next' rather than 'net'.

Thanks, I'm confused about when to use "nf-next" or "net" or "net-next".
"nf-next" means fixing errors for linux-next.git and linux-stable.git, while
"nf" or "next" just means linux-next.git?


Hi Kunwu,

nf is for fixes for Netfilter (which includes IPVS). The target tree is nf.git
nf-next is for non-fixes for Netfilter. The target tree if nf-next.git

net is for fixes for Networking code, which does not have a more specific
tree (as is the case for Netfilter). The target tree is net.git.
Liikewise, net-next is for non-fixes for Networking code.
The target tree is net-next.git


Hi Simon,

Thank you very much for your detailed guidance.
In the future, I will carefully follow the rules you introduced to set 
the appropriate subject for the patch.




The MAINTAINERS file, and get_maintainers.pl script are useful here.

nf is merged into net on request from the Netfilter maintainers,
this is it's path to released kernels.
Likewise, nf-next is merged into net-next.

Before send the patch, I'll read the MAINTAINERS file, and search in 
email-list to confirm the correct subject.


And if need a new subject patch, i could resend a new one.




If it is a fix, then I would suggest targeting it at 'nf'
and providing a Fixes tag.

I'll keep it in mind in the future.


The above notwithstanding, this looks good to me.

Acked-by: Simon Horman 


---
   net/netfilter/ipvs/ip_vs_conn.c | 4 +---
   1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index a743db073887..98d7dbe3d787 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1511,9 +1511,7 @@ int __init ip_vs_conn_init(void)
return -ENOMEM;
/* Allocate ip_vs_conn slab cache */
-   ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
- sizeof(struct ip_vs_conn), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+   ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN);
if (!ip_vs_conn_cachep) {
kvfree(ip_vs_conn_tab);
return -ENOMEM;

--
Thanks,
   Kunwu


--
Thanks,
  Kunwu




Re: [PATCH] block: Add ioprio to block_rq tracepoint

2024-01-21 Thread dongliang cui
Hi,

Can this submission be uploaded,or is there anything that needs to be modified?

On Tue, Dec 26, 2023 at 10:10 AM Dongliang Cui  wrote:
>
> Sometimes we need to track the processing order of
> requests with ioprio set, especially when using
> mq-deadline. So the ioprio of request can be useful
> information.
>
> Signed-off-by: Dongliang Cui 
> ---
>  include/trace/events/block.h | 20 ++--
>  1 file changed, 14 insertions(+), 6 deletions(-)
>
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> index 0e128ad..e84ff93 100644
> --- a/include/trace/events/block.h
> +++ b/include/trace/events/block.h
> @@ -82,6 +82,7 @@
> __field(  dev_t,dev )
> __field(  sector_t, sector  )
> __field(  unsigned int, nr_sector   )
> +   __field(  unsigned int, ioprio  )
> __array(  char, rwbs,   RWBS_LEN)
> __dynamic_array( char,  cmd,1   )
> ),
> @@ -90,16 +91,17 @@
> __entry->dev   = rq->q->disk ? disk_devt(rq->q->disk) : 0;
> __entry->sector= blk_rq_trace_sector(rq);
> __entry->nr_sector = blk_rq_trace_nr_sectors(rq);
> +   __entry->ioprio= rq->ioprio;
>
> blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
> __get_str(cmd)[0] = '\0';
> ),
>
> -   TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> +   TP_printk("%d,%d %s (%s) %llu + %u 0x%x [%d]",
>   MAJOR(__entry->dev), MINOR(__entry->dev),
>   __entry->rwbs, __get_str(cmd),
>   (unsigned long long)__entry->sector,
> - __entry->nr_sector, 0)
> + __entry->nr_sector, __entry->ioprio, 0)
>  );
>
>  DECLARE_EVENT_CLASS(block_rq_completion,
> @@ -112,6 +114,7 @@
> __field(  dev_t,dev )
> __field(  sector_t, sector  )
> __field(  unsigned int, nr_sector   )
> +   __field(  unsigned int, ioprio  )
> __field(  int   ,   error   )
> __array(  char, rwbs,   RWBS_LEN)
> __dynamic_array( char,  cmd,1   )
> @@ -121,17 +124,19 @@
> __entry->dev   = rq->q->disk ? disk_devt(rq->q->disk) : 0;
> __entry->sector= blk_rq_pos(rq);
> __entry->nr_sector = nr_bytes >> 9;
> +   __entry->ioprio= rq->ioprio;
> __entry->error = blk_status_to_errno(error);
>
> blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
> __get_str(cmd)[0] = '\0';
> ),
>
> -   TP_printk("%d,%d %s (%s) %llu + %u [%d]",
> +   TP_printk("%d,%d %s (%s) %llu + %u 0x%x [%d]",
>   MAJOR(__entry->dev), MINOR(__entry->dev),
>   __entry->rwbs, __get_str(cmd),
>   (unsigned long long)__entry->sector,
> - __entry->nr_sector, __entry->error)
> + __entry->nr_sector, __entry->ioprio,
> + __entry->error)
>  );
>
>  /**
> @@ -180,6 +185,7 @@
> __field(  sector_t, sector  )
> __field(  unsigned int, nr_sector   )
> __field(  unsigned int, bytes   )
> +   __field(  unsigned int, ioprio  )
> __array(  char, rwbs,   RWBS_LEN)
> __array(  char, comm,   TASK_COMM_LEN   )
> __dynamic_array( char,  cmd,1   )
> @@ -190,17 +196,19 @@
> __entry->sector= blk_rq_trace_sector(rq);
> __entry->nr_sector = blk_rq_trace_nr_sectors(rq);
> __entry->bytes = blk_rq_bytes(rq);
> +   __entry->ioprio= rq->ioprio;
>
> blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
> __get_str(cmd)[0] = '\0';
> memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> ),
>
> -   TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
> +   TP_printk("%d,%d %s %u (%s) %llu + %u 0x%x [%s]",
>   MAJOR(__entry->dev), MINOR(__entry->dev),
>   __entry->rwbs, __entry->bytes, __get_str(cmd),
>   (unsigned long long)__entry->sector,
> - __entry->nr_sector, __entry->comm)
> + __entry->nr_sector, __entry->ioprio,
> + __entry->comm)
>  );
>
>  /**
> --
> 1.9.1
>



Re: [PATCH V1] vdpa_sim: reset must not run

2024-01-21 Thread Jason Wang
On Thu, Jan 18, 2024 at 3:23 AM Steve Sistare  wrote:
>
> vdpasim_do_reset sets running to true, which is wrong, as it allows
> vdpasim_kick_vq to post work requests before the device has been
> configured.  To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK
> is set.
>
> Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op")
> Signed-off-by: Steve Sistare 
> Reviewed-by: Eugenio Pérez 

Acked-by: Jason Wang 

Thanks




Re: [RFC V1 00/13] vdpa live update

2024-01-21 Thread Jason Wang
On Thu, Jan 18, 2024 at 4:32 AM Steven Sistare
 wrote:
>
> On 1/10/2024 9:55 PM, Jason Wang wrote:
> > On Thu, Jan 11, 2024 at 4:40 AM Steve Sistare  
> > wrote:
> >>
> >> Live update is a technique wherein an application saves its state, exec's
> >> to an updated version of itself, and restores its state.  Clients of the
> >> application experience a brief suspension of service, on the order of
> >> 100's of milliseconds, but are otherwise unaffected.
> >>
> >> Define and implement interfaces that allow vdpa devices to be preserved
> >> across fork or exec, to support live update for applications such as qemu.
> >> The device must be suspended during the update, but its dma mappings are
> >> preserved, so the suspension is brief.
> >>
> >> The VHOST_NEW_OWNER ioctl transfers device ownership and pinned memory
> >> accounting from one process to another.
> >>
> >> The VHOST_BACKEND_F_NEW_OWNER backend capability indicates that
> >> VHOST_NEW_OWNER is supported.
> >>
> >> The VHOST_IOTLB_REMAP message type updates a dma mapping with its userland
> >> address in the new process.
> >>
> >> The VHOST_BACKEND_F_IOTLB_REMAP backend capability indicates that
> >> VHOST_IOTLB_REMAP is supported and required.  Some devices do not
> >> require it, because the userland address of each dma mapping is discarded
> >> after being translated to a physical address.
> >>
> >> Here is a pseudo-code sequence for performing live update, based on
> >> suspend + reset because resume is not yet available.  The vdpa device
> >> descriptor, fd, remains open across the exec.
> >>
> >>   ioctl(fd, VHOST_VDPA_SUSPEND)
> >>   ioctl(fd, VHOST_VDPA_SET_STATUS, 0)
> >>   exec
> >
> > Is there a userspace implementation as a reference?
>
> I have working patches for qemu that use these ioctl's, but they depend on 
> other
> qemu cpr patches that are a work in progress, and not posted yet.  I'm 
> working on
> that.

Ok.

>
> >>   ioctl(fd, VHOST_NEW_OWNER)
> >>
> >>   issue ioctls to re-create vrings
> >>
> >>   if VHOST_BACKEND_F_IOTLB_REMAP
> >>   foreach dma mapping
> >>   write(fd, {VHOST_IOTLB_REMAP, new_addr})
> >
> > I think I need to understand the advantages of this approach. For
> > example, why it is better than
> >
> > ioctl(VHOST_RESET_OWNER)
> > exec
> >
> > ioctl(VHOST_SET_OWNER)
> >
> > for each dma mapping
> >  ioctl(VHOST_IOTLB_UPDATE)
>
> That is slower.  VHOST_RESET_OWNER unbinds physical pages, and 
> VHOST_IOTLB_UPDATE
> rebinds them.  It costs multiple seconds for large memories, and is incurred 
> during the
> virtual machine's pause time during live update.  For comparison, the total 
> pause time
> for live update with vfio interfaces is ~100 millis.
>
> However, the interaction with userland is so similar that the same code paths 
> can be used.
> In my qemu prototype, after cpr exec's new qemu:
>   - vhost_vdpa_set_owner() calls VHOST_NEW_OWNER instead of VHOST_SET_OWNER
>   - vhost_vdpa_dma_map() sets type VHOST_IOTLB_REMAP instead of 
> VHOST_IOTLB_UPDATE
>
> - Steve
>

Ok, let's document this in the changlog.

Thanks




Re: [RFC V1 05/13] vhost-vdpa: VHOST_IOTLB_REMAP

2024-01-21 Thread Jason Wang
On Thu, Jan 18, 2024 at 4:32 AM Steven Sistare
 wrote:
>
> On 1/10/2024 10:08 PM, Jason Wang wrote:
> > On Thu, Jan 11, 2024 at 4:40 AM Steve Sistare  
> > wrote:
> >>
> >> When device ownership is passed to a new process via VHOST_NEW_OWNER,
> >> some devices need to know the new userland addresses of the dma mappings.
> >> Define the new iotlb message type VHOST_IOTLB_REMAP to update the uaddr
> >> of a mapping.  The new uaddr must address the same memory object as
> >> originally mapped.
> >>
> >> The user must suspend the device before the old address is invalidated,
> >> and cannot resume it until after VHOST_IOTLB_REMAP is called, but this
> >> requirement is not enforced by the API.
> >>
> >> Signed-off-by: Steve Sistare 
> >> ---
> >>  drivers/vhost/vdpa.c | 34 
> >>  include/uapi/linux/vhost_types.h | 11 ++-
> >>  2 files changed, 44 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> >> index faed6471934a..ec5ca20bd47d 100644
> >> --- a/drivers/vhost/vdpa.c
> >> +++ b/drivers/vhost/vdpa.c
> >> @@ -1219,6 +1219,37 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v,
> >>
> >>  }
> >>
> >> +static int vhost_vdpa_process_iotlb_remap(struct vhost_vdpa *v,
> >> + struct vhost_iotlb *iotlb,
> >> + struct vhost_iotlb_msg *msg)
> >> +{
> >> +   struct vdpa_device *vdpa = v->vdpa;
> >> +   const struct vdpa_config_ops *ops = vdpa->config;
> >> +   u32 asid = iotlb_to_asid(iotlb);
> >> +   u64 start = msg->iova;
> >> +   u64 last = start + msg->size - 1;
> >> +   struct vhost_iotlb_map *map;
> >> +   int r = 0;
> >> +
> >> +   if (msg->perm || !msg->size)
> >> +   return -EINVAL;
> >> +
> >> +   map = vhost_iotlb_itree_first(iotlb, start, last);
> >> +   if (!map)
> >> +   return -ENOENT;
> >> +
> >> +   if (map->start != start || map->last != last)
> >> +   return -EINVAL;
> >> +
> >> +   /* batch will finish with remap.  non-batch must do it now. */
> >> +   if (!v->in_batch)
> >> +   r = ops->set_map(vdpa, asid, iotlb);
> >> +   if (!r)
> >> +   map->addr = msg->uaddr;
> >
> > I may miss something, for example for PA mapping,
> >
> > 1) need to convert uaddr into phys addr
> > 2) need to check whether the uaddr is backed by the same page or not?
>
> This code does not verify that the new size@uaddr points to the same physical
> pages as the old size@uaddr.  If the app screws up and they differ, then the 
> app
> may corrupt its own memory, but no-one else's.
>
> It would be expensive for large memories to verify page by page, O(npages), 
> and such
> verification lies on the critical path for virtual machine downtime during 
> live update.
> I could compare the properties of the vma(s) for the old size@uaddr vs the 
> vma for the
> new, but that is more complicated and would be a maintenance headache.  When 
> I submitted
> such code to Alex W when writing the equivalent patches for vfio, he said 
> don't check,
> correctness is the user's responsibility.

Ok, let's document this somewhere.

Thanks

>
> - Steve
>
> >> +
> >> +   return r;
> >> +}
> >> +
> >>  static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
> >>struct vhost_iotlb *iotlb,
> >>struct vhost_iotlb_msg *msg)
> >> @@ -1298,6 +1329,9 @@ static int vhost_vdpa_process_iotlb_msg(struct 
> >> vhost_dev *dev, u32 asid,
> >> ops->set_map(vdpa, asid, iotlb);
> >> v->in_batch = false;
> >> break;
> >> +   case VHOST_IOTLB_REMAP:
> >> +   r = vhost_vdpa_process_iotlb_remap(v, iotlb, msg);
> >> +   break;
> >> default:
> >> r = -EINVAL;
> >> break;
> >> diff --git a/include/uapi/linux/vhost_types.h 
> >> b/include/uapi/linux/vhost_types.h
> >> index 9177843951e9..35908315ff55 100644
> >> --- a/include/uapi/linux/vhost_types.h
> >> +++ b/include/uapi/linux/vhost_types.h
> >> @@ -79,7 +79,7 @@ struct vhost_iotlb_msg {
> >>  /*
> >>   * VHOST_IOTLB_BATCH_BEGIN and VHOST_IOTLB_BATCH_END allow modifying
> >>   * multiple mappings in one go: beginning with
> >> - * VHOST_IOTLB_BATCH_BEGIN, followed by any number of
> >> + * VHOST_IOTLB_BATCH_BEGIN, followed by any number of VHOST_IOTLB_REMAP or
> >>   * VHOST_IOTLB_UPDATE messages, and ending with VHOST_IOTLB_BATCH_END.
> >>   * When one of these two values is used as the message type, the rest
> >>   * of the fields in the message are ignored. There's no guarantee that
> >> @@ -87,6 +87,15 @@ struct vhost_iotlb_msg {
> >>   */
> >>  #define VHOST_IOTLB_BATCH_BEGIN5
> >>  #define VHOST_IOTLB_BATCH_END  6
> >> +
> >> +/*
> >> + * VHOST_IOTLB_REMAP registers a new uaddr for the 

WARNING in depot_fetch_stack

2024-01-21 Thread Ubisectech Sirius
Hello.
We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
Recently, our team has discovered a issue in Linux kernel 6.7.0-g052d534373b7. 
Attached to the email were a POC file of the issue.
Stack dump:
[ 154.711833][ T8003] [ cut here ]
[ 154.711851][ T8003] pool index 81727 out of bounds (941) for stack id 3f3f3f3f
[ 154.712204][ T8003] WARNING: CPU: 1 PID: 8003 at lib/stackdepot.c:410 
depot_fetch_stack (lib/stackdepot.c:410 (discriminator 1))
[ 154.712267][ T8003] Modules linked in:
[ 154.712284][ T8003] CPU: 1 PID: 8003 Comm: poc Not tainted 
6.7.0-g9d1694dc91ce #20
[ 154.712302][ T8003] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
BIOS 1.15.0-1 04/01/2014
[ 154.712315][ T8003] RIP: 0010:depot_fetch_stack (lib/stackdepot.c:410 
(discriminator 1))
[ 154.712491][ T8003] Call Trace:
[ 154.712496][ T8003] 
[ 154.712766][ T8003] stack_depot_put (lib/stackdepot.c:632 
lib/stackdepot.c:620)
[ 154.712788][ T8003] kasan_release_object_meta (mm/kasan/generic.c:511 
mm/kasan/generic.c:543)
[ 154.712807][ T8003] qlist_free_all (./arch/x86/include/asm/jump_label.h:27 
mm/kasan/../slab.h:646 mm/kasan/quarantine.c:156 mm/kasan/quarantine.c:176)
[ 154.712823][ T8003] kasan_quarantine_reduce (./include/linux/srcu.h:285 
mm/kasan/quarantine.c:284)
[ 154.712843][ T8003] __kasan_slab_alloc (mm/kasan/common.c:326)
[ 154.712867][ T8003] kmalloc_trace (mm/slub.c:3814 mm/slub.c:3860 
mm/slub.c:4007)
[ 154.712888][ T8003] bdev_open_by_dev (block/bdev.c:822)
[ 154.712908][ T8003] blkdev_open (block/fops.c:617 (discriminator 4))
[ 154.712926][ T8003] do_dentry_open (fs/open.c:954)
[ 154.712969][ T8003] path_openat (fs/namei.c:3642 fs/namei.c:3798)
[ 154.713068][ T8003] do_filp_open (fs/namei.c:3826)
[ 154.713216][ T8003] do_sys_openat2 (fs/open.c:1405)
[ 154.713306][ T8003] __x64_sys_openat (fs/open.c:1430)
[ 154.713351][ T8003] do_syscall_64 (arch/x86/entry/common.c:52 
arch/x86/entry/common.c:83)
[ 154.713375][ T8003] entry_SYSCALL_64_after_hwframe 
(arch/x86/entry/entry_64.S:129)
[ 154.713396][ T8003] RIP: 0033:0x7f8bc3aa9127
[ 154.713485][ T8003] 
Thank you for taking the time to read this email and we look forward to working 
with you further.
 Ubisectech Sirius Team
 Web: www.ubisectech.com
 Email: bugrep...@ubisectech.com


横板竖版组合LOGO_画板 1.png
Description: Binary data


poc.c
Description: Binary data


Re: Re: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

2024-01-21 Thread Michael S. Tsirkin
On Mon, Jan 08, 2024 at 02:13:25PM +0100, Tobias Huschle wrote:
> On Thu, Dec 14, 2023 at 02:14:59AM -0500, Michael S. Tsirkin wrote:
> > 
> > Peter, would appreciate feedback on this. When is cond_resched()
> > insufficient to give up the CPU? Should 
> > Documentation/kernel-hacking/hacking.rst
> > be updated to require schedule() instead?
> > 
> 
> Happy new year everybody!
> 
> I'd like to bring this thread back to life. To reiterate:
> 
> - The introduction of the EEVDF scheduler revealed a performance
>   regression in a uperf testcase of ~50%.
> - Tracing the scheduler showed that it takes decisions which are
>   in line with its design.
> - The traces showed as well, that a vhost instance might run
>   excessively long on its CPU in some circumstance. Those cause
>   the performance regression as they cause delay times of 100+ms
>   for a kworker which drives the actual network processing.
> - Before EEVDF, the vhost would always be scheduled off its CPU
>   in favor of the kworker, as the kworker was being woken up and
>   the former scheduler was giving more priority to the woken up
>   task. With EEVDF, the kworker, as a long running process, is
>   able to accumulate negative lag, which causes EEVDF to not
>   prefer it on its wake up, leaving the vhost running.
> - If the kworker is not scheduled when being woken up, the vhost
>   continues looping until it is migrated off the CPU.
> - The vhost offers to be scheduled off the CPU by calling 
>   cond_resched(), but, the the need_resched flag is not set,
>   therefore cond_resched() does nothing.
> 
> To solve this, I see the following options 
>   (might not be a complete nor a correct list)
> - Along with the wakeup of the kworker, need_resched needs to
>   be set, such that cond_resched() triggers a reschedule.

Let's try this? Does not look like discussing vhost itself will
draw attention from scheduler guys but posting a scheduling
patch probably will? Can you post a patch?

> - The vhost calls schedule() instead of cond_resched() to give up
>   the CPU. This would of course be a significantly stricter
>   approach and might limit the performance of vhost in other cases.
> - Preventing the kworker from accumulating negative lag as it is
>   mostly not runnable and if it runs, it only runs for a very short
>   time frame. This might clash with the overall concept of EEVDF.
> - On cond_resched(), verify if the consumed runtime of the caller
>   is outweighing the negative lag of another process (e.g. the 
>   kworker) and schedule the other process. Introduces overhead
>   to cond_resched.

Or this last one.


> 
> I would be curious on feedback on those ideas and interested in
> alternative approaches.




[PATCH] ARM: dts: qcom: msm8926-htc-memul: Add rmtfs memory node

2024-01-21 Thread Luca Weiss
Add the rmtfs-mem node which was part of one of the "unknown" memory
reservation. Split that one, make sure the reserved-memory in total
still covers the same space.

Signed-off-by: Luca Weiss 
---
 arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts 
b/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts
index ed328b24335f..3037344eb240 100644
--- a/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts
+++ b/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts
@@ -107,7 +107,20 @@ smem_region: smem@fa0 {
};
 
unknown@fb0 {
-   reg = <0x0fb0 0x1b0>;
+   reg = <0x0fb0 0x28>;
+   no-map;
+   };
+
+   rmtfs@fd8 {
+   compatible = "qcom,rmtfs-mem";
+   reg = <0x0fd8 0x18>;
+   no-map;
+
+   qcom,client-id = <1>;
+   };
+
+   unknown@ff0 {
+   reg = <0x0ff0 0x170>;
no-map;
};
};

---
base-commit: ad5c60d66016e544c51ed98635a74073f761f45d
change-id: 20240121-memul-rmtfs-9aa9b54f200a

Best regards,
-- 
Luca Weiss 




[PATCH] ARM: dts: qcom: apq8026-lg-lenok: Add vibrator support

2024-01-21 Thread Luca Weiss
This device has a vibrator attached to the CAMSS_GP0_CLK, use clk-pwm
and pwm-vibrator to make the vibrator work.

Signed-off-by: Luca Weiss 
---
 arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts | 38 
 1 file changed, 38 insertions(+)

diff --git a/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts 
b/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts
index 0a1fd5eb3c6d..a70de21bf139 100644
--- a/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts
+++ b/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts
@@ -7,6 +7,7 @@
 
 #include "qcom-msm8226.dtsi"
 #include "pm8226.dtsi"
+#include 
 
 /delete-node/ _region;
 
@@ -56,6 +57,29 @@ vreg_wlan: wlan-regulator {
pinctrl-names = "default";
pinctrl-0 = <_regulator_default_state>;
};
+
+   pwm_vibrator: pwm {
+   compatible = "clk-pwm";
+   clocks = < CAMSS_GP0_CLK>;
+
+   pinctrl-0 = <_clk_default_state>;
+   pinctrl-names = "default";
+
+   #pwm-cells = <2>;
+   };
+
+   vibrator {
+   compatible = "pwm-vibrator";
+
+   pwms = <_vibrator 0 1>;
+   pwm-names = "enable";
+
+   vcc-supply = <_l28>;
+   enable-gpios = < 62 GPIO_ACTIVE_HIGH>;
+
+   pinctrl-0 = <_en_default_state>;
+   pinctrl-names = "default";
+   };
 };
 
  {
@@ -330,6 +354,20 @@ reset-pins {
};
};
 
+   vibrator_clk_default_state: vibrator-clk-default-state {
+   pins = "gpio33";
+   function = "gp0_clk";
+   drive-strength = <2>;
+   bias-disable;
+   };
+
+   vibrator_en_default_state: vibrator-en-default-state {
+   pins = "gpio62";
+   function = "gpio";
+   drive-strength = <2>;
+   bias-disable;
+   };
+
wlan_hostwake_default_state: wlan-hostwake-default-state {
pins = "gpio37";
function = "gpio";

---
base-commit: ad5c60d66016e544c51ed98635a74073f761f45d
change-id: 20240121-lenok-vibrator-ce5b1734e2bb

Best regards,
-- 
Luca Weiss