[PATCH v2 5/7] tracing: add new type "%pd/%pD" in readme_msg[]
Signed-off-by: Ye Bin --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a7c6fd934e9..13197d3b86bd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5745,7 +5745,7 @@ static const char readme_msg[] = "\t +|-[u](), \\imm-value, \\\"imm-string\"\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b@/, ustring,\n" - "\t symstr, \\[\\]\n" + "\t symstr, %pd/%pD \\[\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\tfield: ;\n" "\tstype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" -- 2.31.1
[PATCH v2 7/7] selftests/ftrace: add test cases for VFS type "%pd" and "%pD"
This patch adds test cases for new print format type "%pd/%pD".The test cases test the following items: 1. Test README if add "%pd/%pD" type; 2. Test "%pd" type for dput(); 3. Test "%pD" type for vfs_read(); Signed-off-by: Ye Bin --- .../ftrace/test.d/kprobe/kprobe_args_vfs.tc | 79 +++ 1 file changed, 79 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc new file mode 100644 index ..1d8edd294dd6 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc @@ -0,0 +1,79 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event VFS type argument +# requires: kprobe_events + +case `uname -m` in +x86_64) + ARG1=%di +;; +i[3456]86) + ARG1=%ax +;; +aarch64) + ARG1=%x0 +;; +arm*) + ARG1=%r0 +;; +ppc64*) + ARG1=%r3 +;; +ppc*) + ARG1=%r3 +;; +s390*) + ARG1=%r2 +;; +mips*) + ARG1=%r4 +;; +loongarch*) + ARG1=%r4 +;; +riscv*) + ARG1=%a0 +;; +*) + echo "Please implement other architecture here" + exit_untested +esac + +: "Test argument %pd/%pD in README" +grep -q "%pd/%pD" README + +: "Test argument %pd with name" +echo "p:testprobe dput name=${ARG1}:%pd" > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pd without name" +echo "p:testprobe dput ${ARG1}:%pd" > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD with name" +echo "p:testprobe vfs_read name=${ARG1}:%pD" > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD without name" +echo "p:testprobe vfs_read ${ARG1}:%pD" > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace -- 2.31.1
[PATCH v2 4/7] tracing/probes: support '%pD' type for print struct file's name
Similar to '%pD' for printk, use '%pD' for print struct file's name. Signed-off-by: Ye Bin --- kernel/trace/trace_probe.c | 41 -- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1599c0c3e6b7..f9819625de58 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt)"trace_probe: " fmt #include +#include #include "trace_btf.h" #include "trace_probe.h" @@ -1572,28 +1573,38 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char *buf, used = 0; for (i = 0; i < argc; i++) { - if (str_has_suffix(argv[i], ":%pd")) { - char *tmp = kstrdup(argv[i], GFP_KERNEL); - char *equal; + if (!str_has_suffix(argv[i], ":%pd") && + !str_has_suffix(argv[i], ":%pD")) + continue; - if (!tmp) - return -ENOMEM; + char *tmp = kstrdup(argv[i], GFP_KERNEL); + char *equal; + + if (!tmp) + return -ENOMEM; - equal = strchr(tmp, '='); - if (equal) - *equal = '\0'; - tmp[strlen(argv[i]) - 4] = '\0'; + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + tmp[strlen(argv[i]) - 4] = '\0'; + if (argv[i][strlen(argv[i]) - 1] == 'd') ret = snprintf(buf + used, bufsize - used, "%s%s+0x0(+0x%zx(%s)):string", equal ? tmp : "", equal ? "=" : "", offsetof(struct dentry, d_name.name), equal ? equal + 1 : tmp); - kfree(tmp); - if (ret >= bufsize - used) - return -ENOMEM; - argv[i] = buf + used; - used += ret + 1; - } + else + ret = snprintf(buf + used, bufsize - used, + "%s%s+0x0(+0x%zx(+0x%zx(%s))):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + offsetof(struct file, f_path.dentry), + equal ? equal + 1 : tmp); + kfree(tmp); + if (ret >= bufsize - used) + return -ENOMEM; + argv[i] = buf + used; + used += ret + 1; } return 0; -- 2.31.1
[PATCH v2 6/7] Documentation: tracing: add new type '%pd' and '%pD' for kprobe
Similar to printk() '%pd' is for fetch dentry's name from struct dentry's pointer, and '%pD' is for fetch file's name from struct file's pointer. Signed-off-by: Ye Bin --- Documentation/trace/kprobetrace.rst | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index bf9cecb69fc9..a1d12d65a8dc 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -58,7 +58,8 @@ Synopsis of kprobe_events NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types - (x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr" + (x8/x16/x32/x64), VFS layer common type(%pd/%pD) for print + file name, "char", "string", "ustring", "symbol", "symstr" and bitfield are supported. (\*1) only for the probe on function entry (offs == 0). Note, this argument access @@ -113,6 +114,9 @@ With 'symstr' type, you can filter the event with wildcard pattern of the symbols, and you don't need to solve symbol name by yourself. For $comm, the default type is "string"; any other type is invalid. +VFS layer common type(%pd/%pD) is a special type, which fetches dentry's or +file's name from struct dentry's pointer or struct file's pointer. + .. _user_mem_access: User Memory Access -- 2.31.1
[PATCH v2 3/7] tracing/probes: support '%pd' type for print struct dentry's name
Similar to '%pd' for printk, use '%pd' for print struct dentry's name. Signed-off-by: Ye Bin --- kernel/trace/trace_kprobe.c | 6 ++ kernel/trace/trace_probe.h | 1 + 2 files changed, 7 insertions(+) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c4c6e0e0068b..00b74530fbad 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -779,6 +779,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char dbuf[MAX_DENTRY_ARGS_LEN]; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -930,6 +931,11 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, dbuf, + MAX_DENTRY_ARGS_LEN); + if (ret) + goto out; + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 553371a4e0b1..d9c053824975 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -34,6 +34,7 @@ #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 #define MAX_BTF_ARGS_LEN 128 +#define MAX_DENTRY_ARGS_LEN256 #define MAX_STRING_SIZEPATH_MAX #define MAX_ARG_BUF_LEN(MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) -- 2.31.1
[PATCH v2 2/7] tracing/probes: add traceprobe_expand_dentry_args() helper
Add traceprobe_expand_dentry_args() to expand dentry args. this API is prepare to support "%pd" print format for kprobe. Signed-off-by: Ye Bin --- kernel/trace/trace_probe.c | 34 ++ kernel/trace/trace_probe.h | 2 ++ 2 files changed, 36 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4dc74d73fc1d..1599c0c3e6b7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1565,6 +1565,40 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], return ERR_PTR(ret); } +int traceprobe_expand_dentry_args(int argc, const char *argv[], char *buf, + int bufsize) +{ + int i, used, ret; + + used = 0; + for (i = 0; i < argc; i++) { + if (str_has_suffix(argv[i], ":%pd")) { + char *tmp = kstrdup(argv[i], GFP_KERNEL); + char *equal; + + if (!tmp) + return -ENOMEM; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + tmp[strlen(argv[i]) - 4] = '\0'; + ret = snprintf(buf + used, bufsize - used, + "%s%s+0x0(+0x%zx(%s)):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + equal ? equal + 1 : tmp); + kfree(tmp); + if (ret >= bufsize - used) + return -ENOMEM; + argv[i] = buf + used; + used += ret + 1; + } + } + + return 0; +} + void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) { clear_btf_context(ctx); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 850d9ecb6765..553371a4e0b1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -402,6 +402,8 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char **traceprobe_expand_meta_args(int argc, const char *argv[], int *new_argc, char *buf, int bufsize, struct traceprobe_parse_context *ctx); +extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char *buf, +int bufsize); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); -- 2.31.1
[PATCH v2 0/7] support '%pd' and '%pD' for print file name
During fault locating, the file name needs to be printed based on the dentry/file address. The offset needs to be calculated each time, which is troublesome. Similar to printk, kprobe supports printing file names for dentry/file addresses. Diff v2 vs v1: 1. Use "%pd/%pD" print format instead of "pd/pD" print format; 2. Add "%pd/%pD" in README; 3. Expand "%pd/%pD" argument before parameter parsing; 4. Add more detail information in ftrace documentation; 5. Add test cases for new print format in selftests/ftrace; Ye Bin (7): string.h: add str_has_suffix() helper for test string ends with specify string tracing/probes: add traceprobe_expand_dentry_args() helper tracing/probes: support '%pd' type for print struct dentry's name tracing/probes: support '%pD' type for print struct file's name tracing: add new type "%pd/%pD" in readme_msg[] Documentation: tracing: add new type '%pd' and '%pD' for kprobe selftests/ftrace: add test cases for VFS type "%pd" and "%pD" Documentation/trace/kprobetrace.rst | 6 +- include/linux/string.h| 20 + kernel/trace/trace.c | 2 +- kernel/trace/trace_kprobe.c | 6 ++ kernel/trace/trace_probe.c| 45 +++ kernel/trace/trace_probe.h| 3 + .../ftrace/test.d/kprobe/kprobe_args_vfs.tc | 79 +++ 7 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc -- 2.31.1
[PATCH v2 1/7] string.h: add str_has_suffix() helper for test string ends with specify string
str_has_suffix() is test string if ends with specify string. Signed-off-by: Ye Bin --- include/linux/string.h | 20 1 file changed, 20 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index 433c207a01da..e47e9597af27 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -405,4 +405,24 @@ static __always_inline size_t str_has_prefix(const char *str, const char *prefix return strncmp(str, prefix, len) == 0 ? len : 0; } +/** + * str_has_suffix - Test if a string has a given suffix + * @str: The string to test + * @suffix: The string to see if @str ends with + * + * Returns: + * * strlen(@suffix) if @str ends with @suffix + * * 0 if @str does not end with @suffix + */ +static __always_inline size_t str_has_suffix(const char *str, const char *suffix) +{ + size_t len = strlen(suffix); + size_t str_len = strlen(str); + + if (len > str_len) + return 0; + + return strncmp(str + str_len - len, suffix, len) == 0 ? len : 0; +} + #endif /* _LINUX_STRING_H_ */ -- 2.31.1
[PATCH] percpu: improve percpu_alloc_percpu_fail event trace
From: George Guo Add do_warn, warn_limit fields to the output of the percpu_alloc_percpu_fail ftrace event. This is required to percpu_alloc failed with no warning showing. Signed-off-by: George Guo --- include/trace/events/percpu.h | 22 ++ mm/percpu.c | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h index 5b8211ca8950..c5f412e84bb8 100644 --- a/include/trace/events/percpu.h +++ b/include/trace/events/percpu.h @@ -75,15 +75,18 @@ TRACE_EVENT(percpu_free_percpu, TRACE_EVENT(percpu_alloc_percpu_fail, - TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align), + TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align, +bool do_warn, int warn_limit), - TP_ARGS(reserved, is_atomic, size, align), + TP_ARGS(reserved, is_atomic, size, align, do_warn, warn_limit), TP_STRUCT__entry( - __field(bool, reserved) - __field(bool, is_atomic ) - __field(size_t, size) - __field(size_t, align ) + __field(bool, reserved) + __field(bool, is_atomic) + __field(size_t, size) + __field(size_t, align) + __field(bool, do_warn) + __field(int,warn_limit) ), TP_fast_assign( @@ -91,11 +94,14 @@ TRACE_EVENT(percpu_alloc_percpu_fail, __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; + __entry->do_warn= do_warn; + __entry->warn_limit = warn_limit; ), - TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu", + TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu do_warn=%d, warn_limit=%d", __entry->reserved, __entry->is_atomic, - __entry->size, __entry->align) + __entry->size, __entry->align, + __entry->do_warn, __entry->warn_limit) ); TRACE_EVENT(percpu_create_chunk, diff --git a/mm/percpu.c b/mm/percpu.c index 4e11fc1e6def..ac5b48268c99 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1886,7 +1886,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, fail_unlock: spin_unlock_irqrestore(_lock, flags); fail: - trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); + trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align, do_warn, warn_limit); if (do_warn && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", -- 2.34.1
Re: [PATCH net] ipvs: Simplify the allocation of ip_vs_conn slab caches
On 2024/1/19 23:20, Simon Horman wrote: On Thu, Jan 18, 2024 at 10:22:05AM +0800, Kunwu Chan wrote: Hi Simon, Thanks for your reply. On 2024/1/17 17:29, Simon Horman wrote: On Wed, Jan 17, 2024 at 03:20:45PM +0800, Kunwu Chan wrote: Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Hi Kunwu Chan, I think this is more of a cleanup than a fix, so it should probably be targeted at 'nf-next' rather than 'net'. Thanks, I'm confused about when to use "nf-next" or "net" or "net-next". "nf-next" means fixing errors for linux-next.git and linux-stable.git, while "nf" or "next" just means linux-next.git? Hi Kunwu, nf is for fixes for Netfilter (which includes IPVS). The target tree is nf.git nf-next is for non-fixes for Netfilter. The target tree if nf-next.git net is for fixes for Networking code, which does not have a more specific tree (as is the case for Netfilter). The target tree is net.git. Liikewise, net-next is for non-fixes for Networking code. The target tree is net-next.git Hi Simon, Thank you very much for your detailed guidance. In the future, I will carefully follow the rules you introduced to set the appropriate subject for the patch. The MAINTAINERS file, and get_maintainers.pl script are useful here. nf is merged into net on request from the Netfilter maintainers, this is it's path to released kernels. Likewise, nf-next is merged into net-next. Before send the patch, I'll read the MAINTAINERS file, and search in email-list to confirm the correct subject. And if need a new subject patch, i could resend a new one. If it is a fix, then I would suggest targeting it at 'nf' and providing a Fixes tag. I'll keep it in mind in the future. The above notwithstanding, this looks good to me. Acked-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a743db073887..98d7dbe3d787 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1511,9 +1511,7 @@ int __init ip_vs_conn_init(void) return -ENOMEM; /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); if (!ip_vs_conn_cachep) { kvfree(ip_vs_conn_tab); return -ENOMEM; -- Thanks, Kunwu -- Thanks, Kunwu
Re: [PATCH] block: Add ioprio to block_rq tracepoint
Hi, Can this submission be uploaded,or is there anything that needs to be modified? On Tue, Dec 26, 2023 at 10:10 AM Dongliang Cui wrote: > > Sometimes we need to track the processing order of > requests with ioprio set, especially when using > mq-deadline. So the ioprio of request can be useful > information. > > Signed-off-by: Dongliang Cui > --- > include/trace/events/block.h | 20 ++-- > 1 file changed, 14 insertions(+), 6 deletions(-) > > diff --git a/include/trace/events/block.h b/include/trace/events/block.h > index 0e128ad..e84ff93 100644 > --- a/include/trace/events/block.h > +++ b/include/trace/events/block.h > @@ -82,6 +82,7 @@ > __field( dev_t,dev ) > __field( sector_t, sector ) > __field( unsigned int, nr_sector ) > + __field( unsigned int, ioprio ) > __array( char, rwbs, RWBS_LEN) > __dynamic_array( char, cmd,1 ) > ), > @@ -90,16 +91,17 @@ > __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; > __entry->sector= blk_rq_trace_sector(rq); > __entry->nr_sector = blk_rq_trace_nr_sectors(rq); > + __entry->ioprio= rq->ioprio; > > blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); > __get_str(cmd)[0] = '\0'; > ), > > - TP_printk("%d,%d %s (%s) %llu + %u [%d]", > + TP_printk("%d,%d %s (%s) %llu + %u 0x%x [%d]", > MAJOR(__entry->dev), MINOR(__entry->dev), > __entry->rwbs, __get_str(cmd), > (unsigned long long)__entry->sector, > - __entry->nr_sector, 0) > + __entry->nr_sector, __entry->ioprio, 0) > ); > > DECLARE_EVENT_CLASS(block_rq_completion, > @@ -112,6 +114,7 @@ > __field( dev_t,dev ) > __field( sector_t, sector ) > __field( unsigned int, nr_sector ) > + __field( unsigned int, ioprio ) > __field( int , error ) > __array( char, rwbs, RWBS_LEN) > __dynamic_array( char, cmd,1 ) > @@ -121,17 +124,19 @@ > __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; > __entry->sector= blk_rq_pos(rq); > __entry->nr_sector = nr_bytes >> 9; > + __entry->ioprio= rq->ioprio; > __entry->error = blk_status_to_errno(error); > > blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); > __get_str(cmd)[0] = '\0'; > ), > > - TP_printk("%d,%d %s (%s) %llu + %u [%d]", > + TP_printk("%d,%d %s (%s) %llu + %u 0x%x [%d]", > MAJOR(__entry->dev), MINOR(__entry->dev), > __entry->rwbs, __get_str(cmd), > (unsigned long long)__entry->sector, > - __entry->nr_sector, __entry->error) > + __entry->nr_sector, __entry->ioprio, > + __entry->error) > ); > > /** > @@ -180,6 +185,7 @@ > __field( sector_t, sector ) > __field( unsigned int, nr_sector ) > __field( unsigned int, bytes ) > + __field( unsigned int, ioprio ) > __array( char, rwbs, RWBS_LEN) > __array( char, comm, TASK_COMM_LEN ) > __dynamic_array( char, cmd,1 ) > @@ -190,17 +196,19 @@ > __entry->sector= blk_rq_trace_sector(rq); > __entry->nr_sector = blk_rq_trace_nr_sectors(rq); > __entry->bytes = blk_rq_bytes(rq); > + __entry->ioprio= rq->ioprio; > > blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); > __get_str(cmd)[0] = '\0'; > memcpy(__entry->comm, current->comm, TASK_COMM_LEN); > ), > > - TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", > + TP_printk("%d,%d %s %u (%s) %llu + %u 0x%x [%s]", > MAJOR(__entry->dev), MINOR(__entry->dev), > __entry->rwbs, __entry->bytes, __get_str(cmd), > (unsigned long long)__entry->sector, > - __entry->nr_sector, __entry->comm) > + __entry->nr_sector, __entry->ioprio, > + __entry->comm) > ); > > /** > -- > 1.9.1 >
Re: [PATCH V1] vdpa_sim: reset must not run
On Thu, Jan 18, 2024 at 3:23 AM Steve Sistare wrote: > > vdpasim_do_reset sets running to true, which is wrong, as it allows > vdpasim_kick_vq to post work requests before the device has been > configured. To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK > is set. > > Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op") > Signed-off-by: Steve Sistare > Reviewed-by: Eugenio Pérez Acked-by: Jason Wang Thanks
Re: [RFC V1 00/13] vdpa live update
On Thu, Jan 18, 2024 at 4:32 AM Steven Sistare wrote: > > On 1/10/2024 9:55 PM, Jason Wang wrote: > > On Thu, Jan 11, 2024 at 4:40 AM Steve Sistare > > wrote: > >> > >> Live update is a technique wherein an application saves its state, exec's > >> to an updated version of itself, and restores its state. Clients of the > >> application experience a brief suspension of service, on the order of > >> 100's of milliseconds, but are otherwise unaffected. > >> > >> Define and implement interfaces that allow vdpa devices to be preserved > >> across fork or exec, to support live update for applications such as qemu. > >> The device must be suspended during the update, but its dma mappings are > >> preserved, so the suspension is brief. > >> > >> The VHOST_NEW_OWNER ioctl transfers device ownership and pinned memory > >> accounting from one process to another. > >> > >> The VHOST_BACKEND_F_NEW_OWNER backend capability indicates that > >> VHOST_NEW_OWNER is supported. > >> > >> The VHOST_IOTLB_REMAP message type updates a dma mapping with its userland > >> address in the new process. > >> > >> The VHOST_BACKEND_F_IOTLB_REMAP backend capability indicates that > >> VHOST_IOTLB_REMAP is supported and required. Some devices do not > >> require it, because the userland address of each dma mapping is discarded > >> after being translated to a physical address. > >> > >> Here is a pseudo-code sequence for performing live update, based on > >> suspend + reset because resume is not yet available. The vdpa device > >> descriptor, fd, remains open across the exec. > >> > >> ioctl(fd, VHOST_VDPA_SUSPEND) > >> ioctl(fd, VHOST_VDPA_SET_STATUS, 0) > >> exec > > > > Is there a userspace implementation as a reference? > > I have working patches for qemu that use these ioctl's, but they depend on > other > qemu cpr patches that are a work in progress, and not posted yet. I'm > working on > that. Ok. > > >> ioctl(fd, VHOST_NEW_OWNER) > >> > >> issue ioctls to re-create vrings > >> > >> if VHOST_BACKEND_F_IOTLB_REMAP > >> foreach dma mapping > >> write(fd, {VHOST_IOTLB_REMAP, new_addr}) > > > > I think I need to understand the advantages of this approach. For > > example, why it is better than > > > > ioctl(VHOST_RESET_OWNER) > > exec > > > > ioctl(VHOST_SET_OWNER) > > > > for each dma mapping > > ioctl(VHOST_IOTLB_UPDATE) > > That is slower. VHOST_RESET_OWNER unbinds physical pages, and > VHOST_IOTLB_UPDATE > rebinds them. It costs multiple seconds for large memories, and is incurred > during the > virtual machine's pause time during live update. For comparison, the total > pause time > for live update with vfio interfaces is ~100 millis. > > However, the interaction with userland is so similar that the same code paths > can be used. > In my qemu prototype, after cpr exec's new qemu: > - vhost_vdpa_set_owner() calls VHOST_NEW_OWNER instead of VHOST_SET_OWNER > - vhost_vdpa_dma_map() sets type VHOST_IOTLB_REMAP instead of > VHOST_IOTLB_UPDATE > > - Steve > Ok, let's document this in the changlog. Thanks
Re: [RFC V1 05/13] vhost-vdpa: VHOST_IOTLB_REMAP
On Thu, Jan 18, 2024 at 4:32 AM Steven Sistare wrote: > > On 1/10/2024 10:08 PM, Jason Wang wrote: > > On Thu, Jan 11, 2024 at 4:40 AM Steve Sistare > > wrote: > >> > >> When device ownership is passed to a new process via VHOST_NEW_OWNER, > >> some devices need to know the new userland addresses of the dma mappings. > >> Define the new iotlb message type VHOST_IOTLB_REMAP to update the uaddr > >> of a mapping. The new uaddr must address the same memory object as > >> originally mapped. > >> > >> The user must suspend the device before the old address is invalidated, > >> and cannot resume it until after VHOST_IOTLB_REMAP is called, but this > >> requirement is not enforced by the API. > >> > >> Signed-off-by: Steve Sistare > >> --- > >> drivers/vhost/vdpa.c | 34 > >> include/uapi/linux/vhost_types.h | 11 ++- > >> 2 files changed, 44 insertions(+), 1 deletion(-) > >> > >> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c > >> index faed6471934a..ec5ca20bd47d 100644 > >> --- a/drivers/vhost/vdpa.c > >> +++ b/drivers/vhost/vdpa.c > >> @@ -1219,6 +1219,37 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, > >> > >> } > >> > >> +static int vhost_vdpa_process_iotlb_remap(struct vhost_vdpa *v, > >> + struct vhost_iotlb *iotlb, > >> + struct vhost_iotlb_msg *msg) > >> +{ > >> + struct vdpa_device *vdpa = v->vdpa; > >> + const struct vdpa_config_ops *ops = vdpa->config; > >> + u32 asid = iotlb_to_asid(iotlb); > >> + u64 start = msg->iova; > >> + u64 last = start + msg->size - 1; > >> + struct vhost_iotlb_map *map; > >> + int r = 0; > >> + > >> + if (msg->perm || !msg->size) > >> + return -EINVAL; > >> + > >> + map = vhost_iotlb_itree_first(iotlb, start, last); > >> + if (!map) > >> + return -ENOENT; > >> + > >> + if (map->start != start || map->last != last) > >> + return -EINVAL; > >> + > >> + /* batch will finish with remap. non-batch must do it now. */ > >> + if (!v->in_batch) > >> + r = ops->set_map(vdpa, asid, iotlb); > >> + if (!r) > >> + map->addr = msg->uaddr; > > > > I may miss something, for example for PA mapping, > > > > 1) need to convert uaddr into phys addr > > 2) need to check whether the uaddr is backed by the same page or not? > > This code does not verify that the new size@uaddr points to the same physical > pages as the old size@uaddr. If the app screws up and they differ, then the > app > may corrupt its own memory, but no-one else's. > > It would be expensive for large memories to verify page by page, O(npages), > and such > verification lies on the critical path for virtual machine downtime during > live update. > I could compare the properties of the vma(s) for the old size@uaddr vs the > vma for the > new, but that is more complicated and would be a maintenance headache. When > I submitted > such code to Alex W when writing the equivalent patches for vfio, he said > don't check, > correctness is the user's responsibility. Ok, let's document this somewhere. Thanks > > - Steve > > >> + > >> + return r; > >> +} > >> + > >> static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, > >>struct vhost_iotlb *iotlb, > >>struct vhost_iotlb_msg *msg) > >> @@ -1298,6 +1329,9 @@ static int vhost_vdpa_process_iotlb_msg(struct > >> vhost_dev *dev, u32 asid, > >> ops->set_map(vdpa, asid, iotlb); > >> v->in_batch = false; > >> break; > >> + case VHOST_IOTLB_REMAP: > >> + r = vhost_vdpa_process_iotlb_remap(v, iotlb, msg); > >> + break; > >> default: > >> r = -EINVAL; > >> break; > >> diff --git a/include/uapi/linux/vhost_types.h > >> b/include/uapi/linux/vhost_types.h > >> index 9177843951e9..35908315ff55 100644 > >> --- a/include/uapi/linux/vhost_types.h > >> +++ b/include/uapi/linux/vhost_types.h > >> @@ -79,7 +79,7 @@ struct vhost_iotlb_msg { > >> /* > >> * VHOST_IOTLB_BATCH_BEGIN and VHOST_IOTLB_BATCH_END allow modifying > >> * multiple mappings in one go: beginning with > >> - * VHOST_IOTLB_BATCH_BEGIN, followed by any number of > >> + * VHOST_IOTLB_BATCH_BEGIN, followed by any number of VHOST_IOTLB_REMAP or > >> * VHOST_IOTLB_UPDATE messages, and ending with VHOST_IOTLB_BATCH_END. > >> * When one of these two values is used as the message type, the rest > >> * of the fields in the message are ignored. There's no guarantee that > >> @@ -87,6 +87,15 @@ struct vhost_iotlb_msg { > >> */ > >> #define VHOST_IOTLB_BATCH_BEGIN5 > >> #define VHOST_IOTLB_BATCH_END 6 > >> + > >> +/* > >> + * VHOST_IOTLB_REMAP registers a new uaddr for the
WARNING in depot_fetch_stack
Hello. We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. Recently, our team has discovered a issue in Linux kernel 6.7.0-g052d534373b7. Attached to the email were a POC file of the issue. Stack dump: [ 154.711833][ T8003] [ cut here ] [ 154.711851][ T8003] pool index 81727 out of bounds (941) for stack id 3f3f3f3f [ 154.712204][ T8003] WARNING: CPU: 1 PID: 8003 at lib/stackdepot.c:410 depot_fetch_stack (lib/stackdepot.c:410 (discriminator 1)) [ 154.712267][ T8003] Modules linked in: [ 154.712284][ T8003] CPU: 1 PID: 8003 Comm: poc Not tainted 6.7.0-g9d1694dc91ce #20 [ 154.712302][ T8003] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 154.712315][ T8003] RIP: 0010:depot_fetch_stack (lib/stackdepot.c:410 (discriminator 1)) [ 154.712491][ T8003] Call Trace: [ 154.712496][ T8003] [ 154.712766][ T8003] stack_depot_put (lib/stackdepot.c:632 lib/stackdepot.c:620) [ 154.712788][ T8003] kasan_release_object_meta (mm/kasan/generic.c:511 mm/kasan/generic.c:543) [ 154.712807][ T8003] qlist_free_all (./arch/x86/include/asm/jump_label.h:27 mm/kasan/../slab.h:646 mm/kasan/quarantine.c:156 mm/kasan/quarantine.c:176) [ 154.712823][ T8003] kasan_quarantine_reduce (./include/linux/srcu.h:285 mm/kasan/quarantine.c:284) [ 154.712843][ T8003] __kasan_slab_alloc (mm/kasan/common.c:326) [ 154.712867][ T8003] kmalloc_trace (mm/slub.c:3814 mm/slub.c:3860 mm/slub.c:4007) [ 154.712888][ T8003] bdev_open_by_dev (block/bdev.c:822) [ 154.712908][ T8003] blkdev_open (block/fops.c:617 (discriminator 4)) [ 154.712926][ T8003] do_dentry_open (fs/open.c:954) [ 154.712969][ T8003] path_openat (fs/namei.c:3642 fs/namei.c:3798) [ 154.713068][ T8003] do_filp_open (fs/namei.c:3826) [ 154.713216][ T8003] do_sys_openat2 (fs/open.c:1405) [ 154.713306][ T8003] __x64_sys_openat (fs/open.c:1430) [ 154.713351][ T8003] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) [ 154.713375][ T8003] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) [ 154.713396][ T8003] RIP: 0033:0x7f8bc3aa9127 [ 154.713485][ T8003] Thank you for taking the time to read this email and we look forward to working with you further. Ubisectech Sirius Team Web: www.ubisectech.com Email: bugrep...@ubisectech.com 横板竖版组合LOGO_画板 1.png Description: Binary data poc.c Description: Binary data
Re: Re: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)
On Mon, Jan 08, 2024 at 02:13:25PM +0100, Tobias Huschle wrote: > On Thu, Dec 14, 2023 at 02:14:59AM -0500, Michael S. Tsirkin wrote: > > > > Peter, would appreciate feedback on this. When is cond_resched() > > insufficient to give up the CPU? Should > > Documentation/kernel-hacking/hacking.rst > > be updated to require schedule() instead? > > > > Happy new year everybody! > > I'd like to bring this thread back to life. To reiterate: > > - The introduction of the EEVDF scheduler revealed a performance > regression in a uperf testcase of ~50%. > - Tracing the scheduler showed that it takes decisions which are > in line with its design. > - The traces showed as well, that a vhost instance might run > excessively long on its CPU in some circumstance. Those cause > the performance regression as they cause delay times of 100+ms > for a kworker which drives the actual network processing. > - Before EEVDF, the vhost would always be scheduled off its CPU > in favor of the kworker, as the kworker was being woken up and > the former scheduler was giving more priority to the woken up > task. With EEVDF, the kworker, as a long running process, is > able to accumulate negative lag, which causes EEVDF to not > prefer it on its wake up, leaving the vhost running. > - If the kworker is not scheduled when being woken up, the vhost > continues looping until it is migrated off the CPU. > - The vhost offers to be scheduled off the CPU by calling > cond_resched(), but, the the need_resched flag is not set, > therefore cond_resched() does nothing. > > To solve this, I see the following options > (might not be a complete nor a correct list) > - Along with the wakeup of the kworker, need_resched needs to > be set, such that cond_resched() triggers a reschedule. Let's try this? Does not look like discussing vhost itself will draw attention from scheduler guys but posting a scheduling patch probably will? Can you post a patch? > - The vhost calls schedule() instead of cond_resched() to give up > the CPU. This would of course be a significantly stricter > approach and might limit the performance of vhost in other cases. > - Preventing the kworker from accumulating negative lag as it is > mostly not runnable and if it runs, it only runs for a very short > time frame. This might clash with the overall concept of EEVDF. > - On cond_resched(), verify if the consumed runtime of the caller > is outweighing the negative lag of another process (e.g. the > kworker) and schedule the other process. Introduces overhead > to cond_resched. Or this last one. > > I would be curious on feedback on those ideas and interested in > alternative approaches.
[PATCH] ARM: dts: qcom: msm8926-htc-memul: Add rmtfs memory node
Add the rmtfs-mem node which was part of one of the "unknown" memory reservation. Split that one, make sure the reserved-memory in total still covers the same space. Signed-off-by: Luca Weiss --- arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts b/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts index ed328b24335f..3037344eb240 100644 --- a/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts +++ b/arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts @@ -107,7 +107,20 @@ smem_region: smem@fa0 { }; unknown@fb0 { - reg = <0x0fb0 0x1b0>; + reg = <0x0fb0 0x28>; + no-map; + }; + + rmtfs@fd8 { + compatible = "qcom,rmtfs-mem"; + reg = <0x0fd8 0x18>; + no-map; + + qcom,client-id = <1>; + }; + + unknown@ff0 { + reg = <0x0ff0 0x170>; no-map; }; }; --- base-commit: ad5c60d66016e544c51ed98635a74073f761f45d change-id: 20240121-memul-rmtfs-9aa9b54f200a Best regards, -- Luca Weiss
[PATCH] ARM: dts: qcom: apq8026-lg-lenok: Add vibrator support
This device has a vibrator attached to the CAMSS_GP0_CLK, use clk-pwm and pwm-vibrator to make the vibrator work. Signed-off-by: Luca Weiss --- arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts | 38 1 file changed, 38 insertions(+) diff --git a/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts b/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts index 0a1fd5eb3c6d..a70de21bf139 100644 --- a/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts +++ b/arch/arm/boot/dts/qcom/qcom-apq8026-lg-lenok.dts @@ -7,6 +7,7 @@ #include "qcom-msm8226.dtsi" #include "pm8226.dtsi" +#include /delete-node/ _region; @@ -56,6 +57,29 @@ vreg_wlan: wlan-regulator { pinctrl-names = "default"; pinctrl-0 = <_regulator_default_state>; }; + + pwm_vibrator: pwm { + compatible = "clk-pwm"; + clocks = < CAMSS_GP0_CLK>; + + pinctrl-0 = <_clk_default_state>; + pinctrl-names = "default"; + + #pwm-cells = <2>; + }; + + vibrator { + compatible = "pwm-vibrator"; + + pwms = <_vibrator 0 1>; + pwm-names = "enable"; + + vcc-supply = <_l28>; + enable-gpios = < 62 GPIO_ACTIVE_HIGH>; + + pinctrl-0 = <_en_default_state>; + pinctrl-names = "default"; + }; }; { @@ -330,6 +354,20 @@ reset-pins { }; }; + vibrator_clk_default_state: vibrator-clk-default-state { + pins = "gpio33"; + function = "gp0_clk"; + drive-strength = <2>; + bias-disable; + }; + + vibrator_en_default_state: vibrator-en-default-state { + pins = "gpio62"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + wlan_hostwake_default_state: wlan-hostwake-default-state { pins = "gpio37"; function = "gpio"; --- base-commit: ad5c60d66016e544c51ed98635a74073f761f45d change-id: 20240121-lenok-vibrator-ce5b1734e2bb Best regards, -- Luca Weiss