Re: [PATCH v2 1/2] tracing/user_events: Fix non-spaced field matching
On Thu, May 02, 2024 at 05:16:34PM -0400, Steven Rostedt wrote: > On Tue, 23 Apr 2024 16:23:37 + > Beau Belgrave wrote: > > > When the ABI was updated to prevent same name w/different args, it > > missed an important corner case when fields don't end with a space. > > Typically, space is used for fields to help separate them, like > > "u8 field1; u8 field2". If no spaces are used, like > > "u8 field1;u8 field2", then the parsing works for the first time. > > However, the match check fails on a subsequent register, leading to > > confusion. > > > > This is because the match check uses argv_split() and assumes that all > > fields will be split upon the space. When spaces are used, we get back > > { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }. > > This causes a mismatch, and the user program gets back -EADDRINUSE. > > > > Add a method to detect this case before calling argv_split(). If found > > force a space after the field separator character ';'. This ensures all > > cases work properly for matching. > > > > With this fix, the following are all treated as matching: > > u8 field1;u8 field2 > > u8 field1; u8 field2 > > u8 field1;\tu8 field2 > > u8 field1;\nu8 field2 > > I'm curious, what happens if you have: "u8 field1; u8 field2;" ? > You'll get an extra whitespace during the copy, assuming it was really: "u8 field1;u8 field2" If it had spaces, this code wouldn't run. > Do you care? As you will then create "u8 field1; u8 field2; " > > but I'm guessing the extra whitespace at the end doesn't affect anything. > Right, you get an extra byte allocated, but the argv_split() with ignore it. The compare will work correctly (I've verified this just now to double check). IE these all match: "Test u8 a; u8 b; " "Test u8 a; u8 b;" "Test u8 a; u8 b" > > > > > Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different > > args event") > > Signed-off-by: Beau Belgrave > > --- > > kernel/trace/trace_events_user.c | 76 +++- > > 1 file changed, 75 insertions(+), 1 deletion(-) > > > > diff --git a/kernel/trace/trace_events_user.c > > b/kernel/trace/trace_events_user.c > > index 70d428c394b6..82b191f33a28 100644 > > --- a/kernel/trace/trace_events_user.c > > +++ b/kernel/trace/trace_events_user.c > > @@ -1989,6 +1989,80 @@ static int user_event_set_tp_name(struct user_event > > *user) > > return 0; > > } > > > > +/* > > + * Counts how many ';' without a trailing space are in the args. > > + */ > > +static int count_semis_no_space(char *args) > > +{ > > + int count = 0; > > + > > + while ((args = strchr(args, ';'))) { > > + args++; > > + > > + if (!isspace(*args)) > > + count++; > > This will count that "..;" > > This is most likely not an issue, but since I didn't see this case > anywhere, I figured I bring it up just to confirm that it's not an issue. > It's not an issue on the matching/logic. However, you do get an extra byte alloc (which doesn't bother me in this edge case). Thanks, -Beau > -- Steve > > > > + } > > + > > + return count; > > +} > > + > > +/* > > + * Copies the arguments while ensuring all ';' have a trailing space. > > + */ > > +static char *insert_space_after_semis(char *args, int count) > > +{ > > + char *fixed, *pos; > > + int len; > > + > > + len = strlen(args) + count; > > + fixed = kmalloc(len + 1, GFP_KERNEL); > > + > > + if (!fixed) > > + return NULL; > > + > > + pos = fixed; > > + > > + /* Insert a space after ';' if there is no trailing space. */ > > + while (*args) { > > + *pos = *args++; > > + > > + if (*pos++ == ';' && !isspace(*args)) > > + *pos++ = ' '; > > + } > > + > > + *pos = '\0'; > > + > > + return fixed; > > +} > > +
[PATCH v2 1/2] tracing/user_events: Fix non-spaced field matching
When the ABI was updated to prevent same name w/different args, it missed an important corner case when fields don't end with a space. Typically, space is used for fields to help separate them, like "u8 field1; u8 field2". If no spaces are used, like "u8 field1;u8 field2", then the parsing works for the first time. However, the match check fails on a subsequent register, leading to confusion. This is because the match check uses argv_split() and assumes that all fields will be split upon the space. When spaces are used, we get back { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }. This causes a mismatch, and the user program gets back -EADDRINUSE. Add a method to detect this case before calling argv_split(). If found force a space after the field separator character ';'. This ensures all cases work properly for matching. With this fix, the following are all treated as matching: u8 field1;u8 field2 u8 field1; u8 field2 u8 field1;\tu8 field2 u8 field1;\nu8 field2 Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different args event") Signed-off-by: Beau Belgrave --- kernel/trace/trace_events_user.c | 76 +++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 70d428c394b6..82b191f33a28 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1989,6 +1989,80 @@ static int user_event_set_tp_name(struct user_event *user) return 0; } +/* + * Counts how many ';' without a trailing space are in the args. + */ +static int count_semis_no_space(char *args) +{ + int count = 0; + + while ((args = strchr(args, ';'))) { + args++; + + if (!isspace(*args)) + count++; + } + + return count; +} + +/* + * Copies the arguments while ensuring all ';' have a trailing space. + */ +static char *insert_space_after_semis(char *args, int count) +{ + char *fixed, *pos; + int len; + + len = strlen(args) + count; + fixed = kmalloc(len + 1, GFP_KERNEL); + + if (!fixed) + return NULL; + + pos = fixed; + + /* Insert a space after ';' if there is no trailing space. */ + while (*args) { + *pos = *args++; + + if (*pos++ == ';' && !isspace(*args)) + *pos++ = ' '; + } + + *pos = '\0'; + + return fixed; +} + +static char **user_event_argv_split(char *args, int *argc) +{ + char **split; + char *fixed; + int count; + + /* Count how many ';' without a trailing space */ + count = count_semis_no_space(args); + + /* No fixup is required */ + if (!count) + return argv_split(GFP_KERNEL, args, argc); + + /* We must fixup 'field;field' to 'field; field' */ + fixed = insert_space_after_semis(args, count); + + if (!fixed) + return NULL; + + /* We do a normal split afterwards */ + split = argv_split(GFP_KERNEL, fixed, argc); + + /* We can free since argv_split makes a copy */ + kfree(fixed); + + return split; +} + /* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. @@ -2012,7 +2086,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return -EPERM; if (args) { - argv = argv_split(GFP_KERNEL, args, ); + argv = user_event_argv_split(args, ); if (!argv) return -ENOMEM; -- 2.34.1
[PATCH v2 2/2] selftests/user_events: Add non-spacing separator check
The ABI documentation indicates that field separators do not need a space between them, only a ';'. When no spacing is used, the register must work. Any subsequent register, with or without spaces, must match and not return -EADDRINUSE. Add a non-spacing separator case to our self-test register case to ensure it works going forward. Signed-off-by: Beau Belgrave --- tools/testing/selftests/user_events/ftrace_test.c | 8 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index dcd7509fe2e0..0bb46793dcd4 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -261,6 +261,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, )); ASSERT_EQ(0, reg.write_index); + /* Register without separator spacing should still match */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;u32 field2"; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, )); + ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ reg.enable_bit = 29; reg.name_args = (__u64)"__test_event u32 field1;"; @@ -288,6 +294,8 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, )); unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, )); + unreg.disable_bit = 29; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, )); /* Delete should have been auto-done after close and unregister */ close(self->data_fd); -- 2.34.1
[PATCH v2 0/2] tracing/user_events: Fix non-spaced field matching
When the ABI was updated to prevent same name w/different args, it missed an important corner case when fields don't end with a space. Typically, space is used for fields to help separate them, like "u8 field1; u8 field2". If no spaces are used, like "u8 field1;u8 field2", then the parsing works for the first time. However, the match check fails on a subsequent register, leading to confusion. This is because the match check uses argv_split() and assumes that all fields will be split upon the space. When spaces are used, we get back { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }. This causes a mismatch, and the user program gets back -EADDRINUSE. Add a method to detect this case before calling argv_split(). If found force a space after the field separator character ';'. This ensures all cases work properly for matching. I could not find an existing function to accomplish this, so I had to hand code a copy with this logic. If there is a better way to achieve this, I'm all ears. This series also adds a selftest to ensure this doesn't break again. With this fix, the following are all treated as matching: u8 field1;u8 field2 u8 field1; u8 field2 u8 field1;\tu8 field2 u8 field1;\nu8 field2 V2 changes: Renamed fix_semis_no_space() to insert_space_after_semis(). Have user_event_argv_split() return fast in no-split case. Pulled in Masami's shorter loop in insert_space_after_semis(). Beau Belgrave (2): tracing/user_events: Fix non-spaced field matching selftests/user_events: Add non-spacing separator check kernel/trace/trace_events_user.c | 76 ++- .../selftests/user_events/ftrace_test.c | 8 ++ 2 files changed, 83 insertions(+), 1 deletion(-) base-commit: 0bbac3facb5d6cc0171c45c9873a2dc96bea9680 -- 2.34.1
Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching
On Sat, Apr 20, 2024 at 09:50:52PM +0900, Masami Hiramatsu wrote: > On Fri, 19 Apr 2024 14:13:34 -0700 > Beau Belgrave wrote: > > > On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote: > > > On Tue, 16 Apr 2024 22:41:01 + > > > Beau Belgrave wrote: *SNIP* > > > nit: This loop can be simpler, because we are sure fixed has enough > > > length; > > > > > > /* insert a space after ';' if there is no space. */ > > > while(*args) { > > > *pos = *args++; > > > if (*pos++ == ';' && !isspace(*args)) > > > *pos++ = ' '; > > > } > > > > > > > I was worried that if count_semis_no_space() ever had different logic > > (maybe after this commit) that it could cause an overflow if the count > > was wrong, etc. > > > > I don't have an issue making it shorter, but I was trying to be more on > > the safe side, since this isn't a fast path (event register). > > OK, anyway current code looks correct. But note that I don't think > "pos++; len--;" is safer, since it is not atomic. This pattern > easily loose "len--;" in my experience. So please carefully use it ;) > I'll stick with your loop. Perhaps others will chime in on the v2 and state a stronger opinion. You scared me with the atomic comment, I went back and looked at all the paths for this. In the user_events IOCTL the buffer is copied from user to kernel, so it cannot change (and no other threads access it). I also checked trace_parse_run_command() which is the same. So at least in this context the non-atomic part is OK. > > > > > > + > > > > + /* > > > > +* len is the length of the copy excluding the null. > > > > +* This ensures we always have room for a null. > > > > +*/ > > > > + *pos = '\0'; > > > > + > > > > + return fixed; > > > > +} > > > > + > > > > +static char **user_event_argv_split(char *args, int *argc) > > > > +{ > > > > + /* Count how many ';' without a trailing space */ > > > > + int count = count_semis_no_space(args); > > > > + > > > > + if (count) { > > > > > > nit: it is better to exit fast, so > > > > > > if (!count) > > > return argv_split(GFP_KERNEL, args, argc); > > > > > > ... > > > > Sure, will fix in a v2. > > > > > > > > Thank you, > > > > > > OT: BTW, can this also simplify synthetic events? > > > > > > > I'm not sure, I'll check when I have some time. I want to get this fix > > in sooner rather than later. > > Ah, nevermind. Synthetic event parses the field by strsep(';') first > and argv_split(). So it does not have this issue. > Ok, seems unrelated. Thanks for checking. Thanks, -Beau > Thank you, > > > > > Thanks, > > -Beau > > *SNIP* > > > Masami Hiramatsu (Google) > > > -- > Masami Hiramatsu (Google)
Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching
On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote: > On Tue, 16 Apr 2024 22:41:01 + > Beau Belgrave wrote: > > > When the ABI was updated to prevent same name w/different args, it > > missed an important corner case when fields don't end with a space. > > Typically, space is used for fields to help separate them, like > > "u8 field1; u8 field2". If no spaces are used, like > > "u8 field1;u8 field2", then the parsing works for the first time. > > However, the match check fails on a subsequent register, leading to > > confusion. > > > > This is because the match check uses argv_split() and assumes that all > > fields will be split upon the space. When spaces are used, we get back > > { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }. > > This causes a mismatch, and the user program gets back -EADDRINUSE. > > > > Add a method to detect this case before calling argv_split(). If found > > force a space after the field separator character ';'. This ensures all > > cases work properly for matching. > > > > With this fix, the following are all treated as matching: > > u8 field1;u8 field2 > > u8 field1; u8 field2 > > u8 field1;\tu8 field2 > > u8 field1;\nu8 field2 > > Sounds good to me. I just have some nits. > > > > > Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different > > args event") > > Signed-off-by: Beau Belgrave > > --- > > kernel/trace/trace_events_user.c | 88 +++- > > 1 file changed, 87 insertions(+), 1 deletion(-) > > > > diff --git a/kernel/trace/trace_events_user.c > > b/kernel/trace/trace_events_user.c > > index 70d428c394b6..9184d3962b2a 100644 > > --- a/kernel/trace/trace_events_user.c > > +++ b/kernel/trace/trace_events_user.c > > @@ -1989,6 +1989,92 @@ static int user_event_set_tp_name(struct user_event > > *user) > > return 0; > > } > > > > +/* > > + * Counts how many ';' without a trailing space are in the args. > > + */ > > +static int count_semis_no_space(char *args) > > +{ > > + int count = 0; > > + > > + while ((args = strchr(args, ';'))) { > > + args++; > > + > > + if (!isspace(*args)) > > + count++; > > + } > > + > > + return count; > > +} > > + > > +/* > > + * Copies the arguments while ensuring all ';' have a trailing space. > > + */ > > +static char *fix_semis_no_space(char *args, int count) > > nit: This name does not represent what it does. 'insert_space_after_semis()' > is more self-described. > Sure, will fix in a v2. > > +{ > > + char *fixed, *pos; > > + char c, last; > > + int len; > > + > > + len = strlen(args) + count; > > + fixed = kmalloc(len + 1, GFP_KERNEL); > > + > > + if (!fixed) > > + return NULL; > > + > > + pos = fixed; > > + last = '\0'; > > + > > + while (len > 0) { > > + c = *args++; > > + > > + if (last == ';' && !isspace(c)) { > > + *pos++ = ' '; > > + len--; > > + } > > + > > + if (len > 0) { > > + *pos++ = c; > > + len--; > > + } > > + > > + last = c; > > + } > > nit: This loop can be simpler, because we are sure fixed has enough length; > > /* insert a space after ';' if there is no space. */ > while(*args) { > *pos = *args++; > if (*pos++ == ';' && !isspace(*args)) > *pos++ = ' '; > } > I was worried that if count_semis_no_space() ever had different logic (maybe after this commit) that it could cause an overflow if the count was wrong, etc. I don't have an issue making it shorter, but I was trying to be more on the safe side, since this isn't a fast path (event register). > > + > > + /* > > +* len is the length of the copy excluding the null. > > +* This ensures we always have room for a null. > > +*/ > > + *pos = '\0'; > > + > > + return fixed; > > +} > > + > > +static char **user_event_argv_split(char *args, int *argc) > > +{ > > + /* Count how many ';' without a trailing space */ > > + int count = count_semis_no_space(args); > > + > > + if (count) { > > nit: it is better to exit fast, so > &
[PATCH 2/2] selftests/user_events: Add non-spacing separator check
The ABI documentation indicates that field separators do not need a space between them, only a ';'. When no spacing is used, the register must work. Any subsequent register, with or without spaces, must match and not return -EADDRINUSE. Add a non-spacing separator case to our self-test register case to ensure it works going forward. Signed-off-by: Beau Belgrave --- tools/testing/selftests/user_events/ftrace_test.c | 8 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index dcd7509fe2e0..0bb46793dcd4 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -261,6 +261,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, )); ASSERT_EQ(0, reg.write_index); + /* Register without separator spacing should still match */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;u32 field2"; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, )); + ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ reg.enable_bit = 29; reg.name_args = (__u64)"__test_event u32 field1;"; @@ -288,6 +294,8 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, )); unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, )); + unreg.disable_bit = 29; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, )); /* Delete should have been auto-done after close and unregister */ close(self->data_fd); -- 2.34.1
[PATCH 1/2] tracing/user_events: Fix non-spaced field matching
When the ABI was updated to prevent same name w/different args, it missed an important corner case when fields don't end with a space. Typically, space is used for fields to help separate them, like "u8 field1; u8 field2". If no spaces are used, like "u8 field1;u8 field2", then the parsing works for the first time. However, the match check fails on a subsequent register, leading to confusion. This is because the match check uses argv_split() and assumes that all fields will be split upon the space. When spaces are used, we get back { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }. This causes a mismatch, and the user program gets back -EADDRINUSE. Add a method to detect this case before calling argv_split(). If found force a space after the field separator character ';'. This ensures all cases work properly for matching. With this fix, the following are all treated as matching: u8 field1;u8 field2 u8 field1; u8 field2 u8 field1;\tu8 field2 u8 field1;\nu8 field2 Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different args event") Signed-off-by: Beau Belgrave --- kernel/trace/trace_events_user.c | 88 +++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 70d428c394b6..9184d3962b2a 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1989,6 +1989,92 @@ static int user_event_set_tp_name(struct user_event *user) return 0; } +/* + * Counts how many ';' without a trailing space are in the args. + */ +static int count_semis_no_space(char *args) +{ + int count = 0; + + while ((args = strchr(args, ';'))) { + args++; + + if (!isspace(*args)) + count++; + } + + return count; +} + +/* + * Copies the arguments while ensuring all ';' have a trailing space. + */ +static char *fix_semis_no_space(char *args, int count) +{ + char *fixed, *pos; + char c, last; + int len; + + len = strlen(args) + count; + fixed = kmalloc(len + 1, GFP_KERNEL); + + if (!fixed) + return NULL; + + pos = fixed; + last = '\0'; + + while (len > 0) { + c = *args++; + + if (last == ';' && !isspace(c)) { + *pos++ = ' '; + len--; + } + + if (len > 0) { + *pos++ = c; + len--; + } + + last = c; + } + + /* +* len is the length of the copy excluding the null. +* This ensures we always have room for a null. +*/ + *pos = '\0'; + + return fixed; +} + +static char **user_event_argv_split(char *args, int *argc) +{ + /* Count how many ';' without a trailing space */ + int count = count_semis_no_space(args); + + if (count) { + /* We must fixup 'field;field' to 'field; field' */ + char *fixed = fix_semis_no_space(args, count); + char **split; + + if (!fixed) + return NULL; + + /* We do a normal split afterwards */ + split = argv_split(GFP_KERNEL, fixed, argc); + + /* We can free since argv_split makes a copy */ + kfree(fixed); + + return split; + } + + /* No fixup is required */ + return argv_split(GFP_KERNEL, args, argc); +} + /* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. @@ -2012,7 +2098,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return -EPERM; if (args) { - argv = argv_split(GFP_KERNEL, args, ); + argv = user_event_argv_split(args, ); if (!argv) return -ENOMEM; -- 2.34.1
[PATCH 0/2] tracing/user_events: Fix non-spaced field matching
When the ABI was updated to prevent same name w/different args, it missed an important corner case when fields don't end with a space. Typically, space is used for fields to help separate them, like "u8 field1; u8 field2". If no spaces are used, like "u8 field1;u8 field2", then the parsing works for the first time. However, the match check fails on a subsequent register, leading to confusion. This is because the match check uses argv_split() and assumes that all fields will be split upon the space. When spaces are used, we get back { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }. This causes a mismatch, and the user program gets back -EADDRINUSE. Add a method to detect this case before calling argv_split(). If found force a space after the field separator character ';'. This ensures all cases work properly for matching. I could not find an existing function to accomplish this, so I had to hand code a copy with this logic. If there is a better way to achieve this, I'm all ears. This series also adds a selftest to ensure this doesn't break again. With this fix, the following are all treated as matching: u8 field1;u8 field2 u8 field1; u8 field2 u8 field1;\tu8 field2 u8 field1;\nu8 field2 Beau Belgrave (2): tracing/user_events: Fix non-spaced field matching selftests/user_events: Add non-spacing separator check kernel/trace/trace_events_user.c | 88 ++- .../selftests/user_events/ftrace_test.c | 8 ++ 2 files changed, 95 insertions(+), 1 deletion(-) base-commit: 0bbac3facb5d6cc0171c45c9873a2dc96bea9680 -- 2.34.1
Re: [RFC PATCH 0/4] perf: Correlating user process data to samples
On Fri, Apr 12, 2024 at 09:12:45AM +0200, Peter Zijlstra wrote: > > On Fri, Apr 12, 2024 at 12:17:28AM +, Beau Belgrave wrote: > > > An idea flow would look like this: > > User Task Profile > > do_work(); sample() -> IP + No activity > > ... > > set_activity(123); > > ... > > do_work(); sample() -> IP + activity (123) > > ... > > set_activity(124); > > ... > > do_work(); sample() -> IP + activity (124) > > This, start with this, because until I saw this, I was utterly confused > as to what the heck you were on about. > Will do. > I started by thinking we already have TID in samples so you can already > associate back to user processes and got increasingly confused the > further I went. > > What you seem to want to do however is have some task-state included so > you can see what the thread is doing. > Yeah, there is typically an external context (not on the machine) that wants to be tied to each sample. The context could be a simple integer, UUID, or something else entirely. For OTel, this is a 16-byte array [1]. > Anyway, since we typically run stuff from NMI context, accessing user > data is 'interesting'. As such I would really like to make this work > depend on the call-graph rework that pushes all the user access bits > into return-to-user. Cool, I assume that's the SFRAME work? Are there pointers to work I could look at and think about what a rebase looks like? Or do you have someone in mind I should work with for this? Thanks, -Beau 1. https://www.w3.org/TR/trace-context/#version-format
Re: [RFC PATCH 0/4] perf: Correlating user process data to samples
On Thu, Apr 11, 2024 at 09:52:22PM -0700, Ian Rogers wrote: > On Thu, Apr 11, 2024 at 5:17 PM Beau Belgrave > wrote: > > > > In the Open Telemetry profiling SIG [1], we are trying to find a way to > > grab a tracing association quickly on a per-sample basis. The team at > > Elastic has a bespoke way to do this [2], however, I'd like to see a > > more general way to achieve this. The folks I've been talking with seem > > open to the idea of just having a TLS value for this we could capture > > Presumably TLS == Thread Local Storage. > Yes, the initial idea is to use thread local storage (TLS). It seems to be the fastest option to save a per-thread value that changes at a fast rate. > > upon each sample. We could then just state, Open Telemetry SDKs should > > have a TLS value for span correlation. However, we need a way to sample > > the TLS or other value(s) when a sampling event is generated. This is > > supported today on Windows via EventActivityIdControl() [3]. Since > > Open Telemetry works on both Windows and Linux, ideally we can do > > something as efficient for Linux based workloads. > > > > This series is to explore how it would be best possible to collect > > supporting data from a user process when a profile sample is collected. > > Having a value stored in TLS makes a lot of sense for this however > > there are other ways to explore. Whatever is chosen, kernel samples > > taken in process context should be able to get this supporting data. > > In these patches on X64 the fsbase and gsbase are used for this. > > > > An option to explore suggested by Mathieu Desnoyers is to utilize rseq > > for processes to register a value location that can be included when > > profiling if desired. This would allow a tighter contract between user > > processes and a profiler. It would allow better labeling/categorizing > > the correlation values. > > It is hard to understand this idea. Are you saying stash a cookie in > TLS for samples to capture to indicate an activity? Restartable > sequences are about preemption on a CPU not of a thread, so at least > my intuition is that they feel different. You could stash information > like this today by changing the thread name which generates comm > events. I've wondered about having similar information in some form of > reserved for profiling stack slot, for example, to stash a pointer to > the name of a function being interpreted. Snapshotting all of a stack > is bad performance wise and for security. A stack slot would be able > to deal with nesting. > You are getting the idea. A slot or tag for a thread would be great! I'm not a fan of overriding the thread comm name (as that already has a use). TLS would be fine, if we could also pass an offset + size + type. Maybe a stack slot that just points to parts of TLS? That way you could have a set of slots that don't require much memory and selectively copy them out of TLS (or where ever those slots point to in user memory). When I was talking to Mathieu about this, it seems that rseq already had a place to potentially put these slots. I'm unsure though how the per thread aspects would work. Mathieu, can you post your ideas here about that? > > An idea flow would look like this: > > User Task Profile > > do_work(); sample() -> IP + No activity > > ... > > set_activity(123); > > ... > > do_work(); sample() -> IP + activity (123) > > ... > > set_activity(124); > > ... > > do_work(); sample() -> IP + activity (124) > > > > Ideally, the set_activity() method would not be a syscall. It needs to > > be very cheap as this should not bottleneck work. Ideally this is just > > a memcpy of 16-20 bytes as it is on Windows via EventActivityIdControl() > > using EVENT_ACTIVITY_CTRL_SET_ID. > > > > For those not aware, Open Telemetry allows collecting data from multiple > > machines and show where time was spent. The tracing context is already > > available for logs, but not for profiling samples. The idea is to show > > where slowdowns occur and have profile samples to explain why they > > slowed down. This must be possible without having to track context > > switches to do this correlation. This is because the profiling rates > > are typically 20hz - 1Khz, while the context switching rates are much > > higher. We do not want to have to consume high context switch rates > > just to know a correlation for a 20hz signal. Often these 20hz signals > > are always enabled in some environments. > > > > Regardless if TLS, rseq, or other source is used I believe we will need > > a way for pe
[RFC PATCH 2/4] perf: Introduce PERF_SAMPLE_TLS_USER sample type
When samples are generated, there is no way via the perf_event ABI to fetch per-thread data. This data is very useful in tracing scenarios that involve correlation IDs, such as OpenTelemetry. They are also useful for tracking per-thread performance details directly within a cooperating user process. The newly establish OpenTelemetry profiling group requires a way to get tracing correlations on both Linux and Windows. On Windows this correlation is on a per-thread basis directly via ETW. On Linux we need a fast mechanism to store these details and TLS seems like the best option, see links for more details. Add a new sample type (PERF_SAMPLE_TLS_USER) that fetches TLS data up to X bytes per-sample. Use the existing PERF_SAMPLE_STACK_USER ABI for outputting data out to consumers. Store requested data size by the user in the previously reserved u16 (__reserved_2) within perf_event_attr. Add tls_addr and tls_user_size to perf_sample_data and calculate them during sample preparation. This allows the output side to know if truncation is going to occur and not having to re-fetch the TLS value from the user process a second time. Add CONFIG_HAVE_PERF_USER_TLS_DUMP so that architectures can specify if they have a TLS specific register (or other logic) that can be used for dumping. This does not yet enable any architecture to do TLS dump, it simply makes it possible by allowing a arch defined method named arch_perf_user_tls_pointer(). Add perf_tls struct that arch_perf_user_tls_pointer() utilizes to set TLS details of the address and size (for 32bit on 64bit compat cases). Link: https://opentelemetry.io/blog/2024/profiling/ Link: https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation Signed-off-by: Beau Belgrave --- arch/Kconfig| 7 +++ include/linux/perf_event.h | 7 +++ include/uapi/linux/perf_event.h | 5 +- kernel/events/core.c| 105 +++- kernel/events/internal.h| 16 + 5 files changed, 137 insertions(+), 3 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 9f066785bb71..6afaf5f46e2f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -430,6 +430,13 @@ config HAVE_PERF_USER_STACK_DUMP access to the user stack pointer which is not unified across architectures. +config HAVE_PERF_USER_TLS_DUMP + bool + help + Support user tls dumps for perf event samples. This needs + access to the user tls pointer which is not unified across + architectures. + config HAVE_ARCH_JUMP_LABEL bool diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d2a15c0c6f8a..7fac81929eed 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1202,8 +1202,15 @@ struct perf_sample_data { u64 data_page_size; u64 code_page_size; u64 aux_size; + u64 tls_addr; + u64 tls_user_size; } cacheline_aligned; +struct perf_tls { + unsigned long base; /* Base address for TLS */ + unsigned long size; /* Size of base address */ +}; + /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3a64499b0f5d..b62669cfe581 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -162,8 +162,9 @@ enum perf_event_sample_format { PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, + PERF_SAMPLE_TLS_USER= 1U << 25, - PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 26, /* non-ABI */ }; #define PERF_SAMPLE_WEIGHT_TYPE(PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) @@ -509,7 +510,7 @@ struct perf_event_attr { */ __u32 aux_watermark; __u16 sample_max_stack; - __u16 __reserved_2; + __u16 sample_tls_user; /* Size of TLS data to dump on samples */ __u32 aux_sample_size; __u32 __reserved_3; diff --git a/kernel/events/core.c b/kernel/events/core.c index 07de5cc2aa25..f848bf4be9bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6926,6 +6926,45 @@ static u64 perf_ustack_task_size(struct pt_regs *regs) return TASK_SIZE - addr; } +/* + * Get remaining task size from user tls pointer. + * + * Outputs the address to use for the dump to avoid doing + * this twice (prepare and output). + */ +static u64 +perf_utls_task_size(struct pt_regs *regs, u64 dump_size, u64 *tls_addr) +{ + struct perf_tls tls; + unsigne
[RFC PATCH 0/4] perf: Correlating user process data to samples
In the Open Telemetry profiling SIG [1], we are trying to find a way to grab a tracing association quickly on a per-sample basis. The team at Elastic has a bespoke way to do this [2], however, I'd like to see a more general way to achieve this. The folks I've been talking with seem open to the idea of just having a TLS value for this we could capture upon each sample. We could then just state, Open Telemetry SDKs should have a TLS value for span correlation. However, we need a way to sample the TLS or other value(s) when a sampling event is generated. This is supported today on Windows via EventActivityIdControl() [3]. Since Open Telemetry works on both Windows and Linux, ideally we can do something as efficient for Linux based workloads. This series is to explore how it would be best possible to collect supporting data from a user process when a profile sample is collected. Having a value stored in TLS makes a lot of sense for this however there are other ways to explore. Whatever is chosen, kernel samples taken in process context should be able to get this supporting data. In these patches on X64 the fsbase and gsbase are used for this. An option to explore suggested by Mathieu Desnoyers is to utilize rseq for processes to register a value location that can be included when profiling if desired. This would allow a tighter contract between user processes and a profiler. It would allow better labeling/categorizing the correlation values. An idea flow would look like this: User Task Profile do_work(); sample() -> IP + No activity ... set_activity(123); ... do_work(); sample() -> IP + activity (123) ... set_activity(124); ... do_work(); sample() -> IP + activity (124) Ideally, the set_activity() method would not be a syscall. It needs to be very cheap as this should not bottleneck work. Ideally this is just a memcpy of 16-20 bytes as it is on Windows via EventActivityIdControl() using EVENT_ACTIVITY_CTRL_SET_ID. For those not aware, Open Telemetry allows collecting data from multiple machines and show where time was spent. The tracing context is already available for logs, but not for profiling samples. The idea is to show where slowdowns occur and have profile samples to explain why they slowed down. This must be possible without having to track context switches to do this correlation. This is because the profiling rates are typically 20hz - 1Khz, while the context switching rates are much higher. We do not want to have to consume high context switch rates just to know a correlation for a 20hz signal. Often these 20hz signals are always enabled in some environments. Regardless if TLS, rseq, or other source is used I believe we will need a way for perf_events to include it within a sample. The changes in this series show how it could be done with TLS. There is some factoring work under perf to make it easier to add more dump types using the existing ABI. This is mostly to make the patches clearer, certainly the refactor parts could get dropped and we could have duplicated/specialized paths. 1. https://opentelemetry.io/blog/2024/profiling/ 2. https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation 3. https://learn.microsoft.com/en-us/windows/win32/api/evntprov/nf-evntprov-eventactivityidcontrol Beau Belgrave (4): perf/core: Introduce perf_prepare_dump_data() perf: Introduce PERF_SAMPLE_TLS_USER sample type perf/core: Factor perf_output_sample_udump() perf/x86/core: Add tls dump support arch/Kconfig | 7 ++ arch/x86/Kconfig | 1 + arch/x86/events/core.c| 14 +++ arch/x86/include/asm/perf_event.h | 5 + include/linux/perf_event.h| 7 ++ include/uapi/linux/perf_event.h | 5 +- kernel/events/core.c | 166 +++--- kernel/events/internal.h | 16 +++ 8 files changed, 180 insertions(+), 41 deletions(-) base-commit: fec50db7033ea478773b159e0e2efb135270e3b7 -- 2.34.1
[RFC PATCH 4/4] perf/x86/core: Add tls dump support
Now that perf supports TLS dumps, x86-64 can provide the details for how to get TLS data for user threads. Enable HAVE_PERF_USER_TLS_DUMP Kconfig only for x86-64. I do not have access to x86 to validate 32-bit. Utilize mmap_is_ia32() to determine 32/64 bit threads. Use fsbase for 64-bit and gsbase for 32-bit with appropriate size. Signed-off-by: Beau Belgrave --- arch/x86/Kconfig | 1 + arch/x86/events/core.c| 14 ++ arch/x86/include/asm/perf_event.h | 5 + 3 files changed, 20 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4fff6ed46e90..8d46ec8ded0c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -263,6 +263,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PERF_USER_TLS_DUMP if X86_64 select MMU_GATHER_RCU_TABLE_FREEif PARAVIRT select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 09050641ce5d..3f851db4c591 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "perf_event.h" @@ -3002,3 +3003,16 @@ u64 perf_get_hw_event_config(int hw_event) return 0; } EXPORT_SYMBOL_GPL(perf_get_hw_event_config); + +#ifdef CONFIG_X86_64 +void arch_perf_user_tls_pointer(struct perf_tls *tls) +{ + if (!mmap_is_ia32()) { + tls->base = current->thread.fsbase; + tls->size = sizeof(u64); + } else { + tls->base = current->thread.gsbase; + tls->size = sizeof(u32); + } +} +#endif diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 3736b8a46c04..d0f65e572c20 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -628,4 +628,9 @@ static __always_inline void perf_lopwr_cb(bool lopwr_in) #define arch_perf_out_copy_user copy_from_user_nmi +#ifdef CONFIG_HAVE_PERF_USER_TLS_DUMP +struct perf_tls; +extern void arch_perf_user_tls_pointer(struct perf_tls *tls); +#endif + #endif /* _ASM_X86_PERF_EVENT_H */ -- 2.34.1
[RFC PATCH 3/4] perf/core: Factor perf_output_sample_udump()
We now have two user dump sources (stack and tls). Both are doing the same logic to ensure the user dump ABI output is properly handled. The only difference is one gets the address within the method, and the other is passed the address. Add perf_output_sample_udump() and utilize it for both stack and tls sample dumps. The sp register is now fetched outside of this method and passed to it. This allows both stack and tls to utilize the same code. Signed-off-by: Beau Belgrave --- kernel/events/core.c | 68 +--- 1 file changed, 19 insertions(+), 49 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f848bf4be9bd..6b3cf5afdd32 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6998,47 +6998,10 @@ perf_sample_dump_size(u16 dump_size, u16 header_size, u64 task_size) } static void -perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, - struct pt_regs *regs) -{ - /* Case of a kernel thread, nothing to dump */ - if (!regs) { - u64 size = 0; - perf_output_put(handle, size); - } else { - unsigned long sp; - unsigned int rem; - u64 dyn_size; - - /* -* We dump: -* static size -* - the size requested by user or the best one we can fit -* in to the sample max size -* data -* - user stack dump data -* dynamic size -* - the actual dumped size -*/ - - /* Static size. */ - perf_output_put(handle, dump_size); - - /* Data. */ - sp = perf_user_stack_pointer(regs); - rem = __output_copy_user(handle, (void *) sp, dump_size); - dyn_size = dump_size - rem; - - perf_output_skip(handle, rem); - - /* Dynamic size. */ - perf_output_put(handle, dyn_size); - } -} - -static void -perf_output_sample_utls(struct perf_output_handle *handle, u64 addr, - u64 dump_size, struct pt_regs *regs) +perf_output_sample_udump(struct perf_output_handle *handle, +unsigned long addr, +u64 dump_size, +struct pt_regs *regs) { /* Case of a kernel thread, nothing to dump */ if (!regs) { @@ -7054,7 +7017,7 @@ perf_output_sample_utls(struct perf_output_handle *handle, u64 addr, * - the size requested by user or the best one we can fit * in to the sample max size * data -* - user tls dump data +* - user dump data * dynamic size * - the actual dumped size */ @@ -7507,9 +7470,16 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_STACK_USER) { - perf_output_sample_ustack(handle, - data->stack_user_size, - data->regs_user.regs); + struct pt_regs *regs = data->regs_user.regs; + unsigned long sp = 0; + + if (regs) + sp = perf_user_stack_pointer(regs); + + perf_output_sample_udump(handle, +sp, +data->stack_user_size, +regs); } if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) @@ -7551,10 +7521,10 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, data->code_page_size); if (sample_type & PERF_SAMPLE_TLS_USER) { - perf_output_sample_utls(handle, - data->tls_addr, - data->tls_user_size, - data->regs_user.regs); + perf_output_sample_udump(handle, +data->tls_addr, +data->tls_user_size, +data->regs_user.regs); } if (sample_type & PERF_SAMPLE_AUX) { -- 2.34.1
[RFC PATCH 1/4] perf/core: Introduce perf_prepare_dump_data()
Factor out perf_prepare_dump_data() so that the same logic is used for dumping stack data as other types, such as TLS. Slightly refactor perf_sample_ustack_size() to perf_sample_dump_size(). Move reg checks up into perf_ustack_task_size() since the task size must now be calculated before preparing dump data. Signed-off-by: Beau Belgrave --- kernel/events/core.c | 79 ++-- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 724e6d7e128f..07de5cc2aa25 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6912,7 +6912,13 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr, */ static u64 perf_ustack_task_size(struct pt_regs *regs) { - unsigned long addr = perf_user_stack_pointer(regs); + unsigned long addr; + + /* No regs, no stack pointer, no dump. */ + if (!regs) + return 0; + + addr = perf_user_stack_pointer(regs); if (!addr || addr >= TASK_SIZE) return 0; @@ -6921,42 +6927,35 @@ static u64 perf_ustack_task_size(struct pt_regs *regs) } static u16 -perf_sample_ustack_size(u16 stack_size, u16 header_size, - struct pt_regs *regs) +perf_sample_dump_size(u16 dump_size, u16 header_size, u64 task_size) { - u64 task_size; - - /* No regs, no stack pointer, no dump. */ - if (!regs) - return 0; - /* -* Check if we fit in with the requested stack size into the: +* Check if we fit in with the requested dump size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size -* If we don't, we customize the stack size to +* If we don't, we customize the dump size to * fit in to the remaining sample size. */ - task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); - stack_size = min(stack_size, (u16) task_size); + task_size = min((u64) USHRT_MAX, task_size); + dump_size = min(dump_size, (u16) task_size); /* Current header size plus static size and dynamic size. */ header_size += 2 * sizeof(u64); - /* Do we fit in with the current stack dump size? */ - if ((u16) (header_size + stack_size) < header_size) { + /* Do we fit in with the current dump size? */ + if ((u16) (header_size + dump_size) < header_size) { /* * If we overflow the maximum size for the sample, -* we customize the stack dump size to fit in. +* we customize the dump size to fit in. */ - stack_size = USHRT_MAX - header_size - sizeof(u64); - stack_size = round_up(stack_size, sizeof(u64)); + dump_size = USHRT_MAX - header_size - sizeof(u64); + dump_size = round_up(dump_size, sizeof(u64)); } - return stack_size; + return dump_size; } static void @@ -7648,6 +7647,32 @@ static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) return d * !!(flags & s); } +static inline u16 +perf_prepare_dump_data(struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs, + u16 dump_size, + u64 task_size) +{ + u16 header_size = perf_sample_data_size(data, event); + u16 size = sizeof(u64); + + dump_size = perf_sample_dump_size(dump_size, header_size, + task_size); + + /* +* If there is something to dump, add space for the dump +* itself and for the field that tells the dynamic size, +* which is how many have been actually dumped. +*/ + if (dump_size) + size += sizeof(u64) + dump_size; + + data->dyn_size += size; + + return dump_size; +} + void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) @@ -7725,22 +7750,12 @@ void perf_prepare_sample(struct perf_sample_data *data, * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; - u16 header_size = perf_sample_data_size(data, event); - u16 size = sizeof(u64); - - stack_size = perf_sample_ustack_size(stack_size, header_size, -data->regs_user.regs); + u64 task_size = perf_ustack_task_size(regs); - /* -* If there is something to dump, add space for the dump -* itself and for the field that tells the dynamic size, -* which is how many hav
Re: Copying TLS/user register data per perf-sample?
On Fri, Apr 12, 2024 at 12:55:19AM +0900, Masami Hiramatsu wrote: > On Wed, 10 Apr 2024 08:35:42 -0700 > Beau Belgrave wrote: > > > On Wed, Apr 10, 2024 at 10:06:28PM +0900, Masami Hiramatsu wrote: > > > On Thu, 4 Apr 2024 12:26:41 -0700 > > > Beau Belgrave wrote: > > > > > > > Hello, > > > > > > > > I'm looking into the possibility of capturing user data that is pointed > > > > to by a user register (IE: fs/gs for TLS on x86/64) for each sample via > > > > perf_events. > > > > > > > > I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER. > > > > I think it could even use roughly the same ABI in the perf ring buffer. > > > > Or it may be possible by some kprobe linked to the perf sample function. > > > > > > > > This would allow a profiler to collect TLS (or other values) on x64. In > > > > the Open Telemetry profiling SIG [1], we are trying to find a fast way > > > > to grab a tracing association quickly on a per-thread basis. The team > > > > at Elastic has a bespoke way to do this [2], however, I'd like to see a > > > > more general way to achieve this. The folks I've been talking with seem > > > > open to the idea of just having a TLS value for this we could capture > > > > upon each sample. We could then just state, Open Telemetry SDKs should > > > > have a TLS value for span correlation. However, we need a way to sample > > > > the TLS value(s) when a sampling event is generated. > > > > > > > > Is this already possible via some other means? It'd be great to be able > > > > to do this directly at the perf_event sample via the ABI or a probe. > > > > > > > > > > Have you tried to use uprobes? It should be able to access user-space > > > registers including fs/gs. > > > > > > > We need to get fs/gs during a sample interrupt from perf. If the sample > > interrupt lands during kernel code (IE: syscall) we would also like to > > get these TLS values when in process context. > > OK, those are not directly accessible from pt_regs. > Yeah, it's a per-arch thread attribute. > > > > I have some patches into the kernel to make this possible via > > perf_events that works well, however, I don't want to reinvent the wheel > > if there is some way to get these via perf samples already. > > I would like to see it. I think it is possible to introduce a helper > to get a base address of user TLS for probe events, and start supporting > from x86. > For sure, I'm hoping the patches start the right conversations. > > > > In OTel, we are trying to attribute samples to transactions that are > > occurring. So the TLS fetch has to be aligned exactly with the sample. > > You can do this via eBPF when it's available, however, we have > > environments where eBPF is not available. > > > > It's sounding like to do this properly without eBPF a new feature would > > be required. If so, I do have some patches I can share in a bit as an > > RFC. > > It is better to be shared in RFC stage, so that we can discuss it from > the direction level. > Agree, it could be that having the ability to run a probe on sample may be a better option. Not sure. Thanks, -Beau > Thank you, > > > > > Thanks, > > -Beau > > > > > Thank you, > > > > > > -- > > > Masami Hiramatsu (Google) > > > -- > Masami Hiramatsu (Google)
Re: Copying TLS/user register data per perf-sample?
On Tue, Apr 09, 2024 at 04:32:46PM -0700, Namhyung Kim wrote: > Hello, > > On Thu, Apr 4, 2024 at 12:26 PM Beau Belgrave > wrote: > > > > Hello, > > > > I'm looking into the possibility of capturing user data that is pointed > > to by a user register (IE: fs/gs for TLS on x86/64) for each sample via > > perf_events. > > > > I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER. > > I think it could even use roughly the same ABI in the perf ring buffer. > > Or it may be possible by some kprobe linked to the perf sample function. > > > > This would allow a profiler to collect TLS (or other values) on x64. In > > the Open Telemetry profiling SIG [1], we are trying to find a fast way > > to grab a tracing association quickly on a per-thread basis. The team > > at Elastic has a bespoke way to do this [2], however, I'd like to see a > > more general way to achieve this. The folks I've been talking with seem > > open to the idea of just having a TLS value for this we could capture > > upon each sample. We could then just state, Open Telemetry SDKs should > > have a TLS value for span correlation. However, we need a way to sample > > the TLS value(s) when a sampling event is generated. > > > > Is this already possible via some other means? It'd be great to be able > > to do this directly at the perf_event sample via the ABI or a probe. > > I don't think the current perf ABI allows capturing %fs/%gs + offset. > IIRC kprobes/uprobes don't have that too but I could be wrong. > Yeah, I didn't see it either. I have some patches that I will submit in a bit as RFC that enable this functionality. I was hoping there was already an easy way to do this. Thanks, -Beau > Thanks, > Namhyung > > > > > 1. https://opentelemetry.io/blog/2024/profiling/ > > 2. > > https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation
Re: Copying TLS/user register data per perf-sample?
On Wed, Apr 10, 2024 at 10:06:28PM +0900, Masami Hiramatsu wrote: > On Thu, 4 Apr 2024 12:26:41 -0700 > Beau Belgrave wrote: > > > Hello, > > > > I'm looking into the possibility of capturing user data that is pointed > > to by a user register (IE: fs/gs for TLS on x86/64) for each sample via > > perf_events. > > > > I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER. > > I think it could even use roughly the same ABI in the perf ring buffer. > > Or it may be possible by some kprobe linked to the perf sample function. > > > > This would allow a profiler to collect TLS (or other values) on x64. In > > the Open Telemetry profiling SIG [1], we are trying to find a fast way > > to grab a tracing association quickly on a per-thread basis. The team > > at Elastic has a bespoke way to do this [2], however, I'd like to see a > > more general way to achieve this. The folks I've been talking with seem > > open to the idea of just having a TLS value for this we could capture > > upon each sample. We could then just state, Open Telemetry SDKs should > > have a TLS value for span correlation. However, we need a way to sample > > the TLS value(s) when a sampling event is generated. > > > > Is this already possible via some other means? It'd be great to be able > > to do this directly at the perf_event sample via the ABI or a probe. > > > > Have you tried to use uprobes? It should be able to access user-space > registers including fs/gs. > We need to get fs/gs during a sample interrupt from perf. If the sample interrupt lands during kernel code (IE: syscall) we would also like to get these TLS values when in process context. I have some patches into the kernel to make this possible via perf_events that works well, however, I don't want to reinvent the wheel if there is some way to get these via perf samples already. In OTel, we are trying to attribute samples to transactions that are occurring. So the TLS fetch has to be aligned exactly with the sample. You can do this via eBPF when it's available, however, we have environments where eBPF is not available. It's sounding like to do this properly without eBPF a new feature would be required. If so, I do have some patches I can share in a bit as an RFC. Thanks, -Beau > Thank you, > > -- > Masami Hiramatsu (Google)
Copying TLS/user register data per perf-sample?
Hello, I'm looking into the possibility of capturing user data that is pointed to by a user register (IE: fs/gs for TLS on x86/64) for each sample via perf_events. I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER. I think it could even use roughly the same ABI in the perf ring buffer. Or it may be possible by some kprobe linked to the perf sample function. This would allow a profiler to collect TLS (or other values) on x64. In the Open Telemetry profiling SIG [1], we are trying to find a fast way to grab a tracing association quickly on a per-thread basis. The team at Elastic has a bespoke way to do this [2], however, I'd like to see a more general way to achieve this. The folks I've been talking with seem open to the idea of just having a TLS value for this we could capture upon each sample. We could then just state, Open Telemetry SDKs should have a TLS value for span correlation. However, we need a way to sample the TLS value(s) when a sampling event is generated. Is this already possible via some other means? It'd be great to be able to do this directly at the perf_event sample via the ABI or a probe. Thanks, -Beau 1. https://opentelemetry.io/blog/2024/profiling/ 2. https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation
[PATCH v4 4/4] tracing/user_events: Document multi-format flag
User programs can now ask user_events to handle the synchronization of multiple different formats for an event with the same name via the new USER_EVENT_REG_MULTI_FORMAT flag. Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended purpose and caveats of using it. Explain how deletion works in these cases and how to use /sys/kernel/tracing/dynamic_events for per-version deletion. Signed-off-by: Beau Belgrave --- Documentation/trace/user_events.rst | 27 ++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index d8f12442aaa6..1d5a7626e6a6 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -92,6 +92,24 @@ The following flags are currently supported. process closes or unregisters the event. Requires CAP_PERFMON otherwise -EPERM is returned. ++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This + allows programs to prevent themselves from being blocked when their event + format changes and they wish to use the same name. When this flag is used the + tracepoint name will be in the new format of "name.unique_id" vs the older + format of "name". A tracepoint will be created for each unique pair of name + and format. This means if several processes use the same name and format, + they will use the same tracepoint. If yet another process uses the same name, + but a different format than the other processes, it will use a different + tracepoint with a new unique id. Recording programs need to scan tracefs for + the various different formats of the event name they are interested in + recording. The system name of the tracepoint will also use "user_events_multi" + instead of "user_events". This prevents single-format event names conflicting + with any multi-format event names within tracefs. The unique_id is output as + a hex string. Recording programs should ensure the tracepoint name starts with + the event name they registered and has a suffix that starts with . and only + has hex characters. For example to find all versions of the event "test" you + can use the regex "^test\.[0-9a-fA-F]+$". + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -106,6 +124,9 @@ or perf record -e user_events:[name] when attaching/recording. **NOTE:** The event subsystem name by default is "user_events". Callers should not assume it will always be "user_events". Operators reserve the right in the future to change the subsystem name per-process to accommodate event isolation. +In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name +will have a unique id appended to it and the system name will be +"user_events_multi" as described above. Command Format ^^ @@ -156,7 +177,11 @@ to request deletes than the one used for registration due to this. to the event. If programs do not want auto-delete, they must use the USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an -event that persists requires CAP_PERFMON, otherwise -EPERM is returned. +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When +there are multiple formats of the same event name, all events with the same +name will be attempted to be deleted. If only a specific version is wanted to +be deleted then the /sys/kernel/tracing/dynamic_events file should be used for +that specific format of the event. Unregistering - -- 2.34.1
[PATCH v4 1/4] tracing/user_events: Prepare find/delete for same name events
The current code for finding and deleting events assumes that there will never be cases when user_events are registered with the same name, but different formats. Scenarios exist where programs want to use the same name but have different formats. An example is multiple versions of a program running side-by-side using the same event name, but with updated formats in each version. This change does not yet allow for multi-format events. If user_events are registered with the same name but different arguments the programs see the same return values as before. This change simply makes it possible to easily accomodate for this. Update find_user_event() to take in argument parameters and register flags to accomodate future multi-format event scenarios. Have find validate argument matching and return error pointers to cover when an existing event has the same name but different format. Update callers to handle error pointer logic. Move delete_user_event() to use hash walking directly now that find_user_event() has changed. Delete all events found that match the register name, stop if an error occurs and report back to the user. Update user_fields_match() to cover list_empty() scenarios now that find_user_event() uses it directly. This makes the logic consistent accross several callsites. Signed-off-by: Beau Belgrave --- kernel/trace/trace_events_user.c | 107 +-- 1 file changed, 59 insertions(+), 48 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9365ce407426..dda58681247e 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = >fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, )) return false; @@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const char *event, match = strcmp(EVENT_NAME(user), event) == 0 && (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); - if (match && argc > 0) + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(>fields); return match; } @@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { - int ret; - u32 key; struct user_event *user; + char **argv = NULL; int argc = 0; - char **argv; + int ret; + u32 key; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = argv_split(GFP_KERNEL, args, ); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(_mutex); - user = find_user_event(group, name, ); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, ); m
[PATCH v4 2/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. This prevents single-format names from conflicting with multi-format events if they end with the same suffix as the multi-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}.{unique_id}. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test.0 test.1 Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Signed-off-by: Beau Belgrave --- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 102 +++ 2 files changed, 95 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index f74f3aedd49c..a03de03dccbc 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -12,6 +12,7 @@ #include #define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_MULTI_SYSTEM "user_events_multi" #define USER_EVENTS_PREFIX "u:" /* Create dynamic location entry within a 32-bit value */ @@ -22,8 +23,11 @@ enum user_reg_flag { /* Event will not delete upon last reference closing */ USER_EVENT_REG_PERSIST = 1U << 0, + /* Event will be allowed to have multiple formats */ + USER_EVENT_REG_MULTI_FORMAT = 1U << 1, + /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, + USER_EVENT_REG_MAX = 1U << 2, }; /* diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index dda58681247e..044920c415e7 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. */ struct user_event_group { - char*system_name; - struct hlist_node node; - struct mutex reg_mutex; + char*system_name; + char*system_multi_name; + struct hlist_node node; + struct mutexreg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that moves forward within
[PATCH v4 3/4] selftests/user_events: Test multi-format events
User_events now has multi-format events which allow for the same register name, but with different formats. When this occurs, different tracepoints are created with unique names. Add a new test that ensures the same name can be used for two different formats. Ensure they are isolated from each other and that name and arg matching still works if yet another register comes in with the same format as one of the two. Signed-off-by: Beau Belgrave --- .../testing/selftests/user_events/abi_test.c | 134 ++ 1 file changed, 134 insertions(+) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index cef1ff1af223..7288a05136ba 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include "../kselftest_harness.h" @@ -23,6 +25,62 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; +const char *multi_dir_glob = "/sys/kernel/tracing/events/user_events_multi/__abi_event.*"; + +static int wait_for_delete(char *dir) +{ + struct stat buf; + int i; + + for (i = 0; i < 1; ++i) { + if (stat(dir, ) == -1 && errno == ENOENT) + return 0; + + usleep(1000); + } + + return -1; +} + +static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len) +{ + char path[256]; + glob_t buf; + int i, ret; + + ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, ); + + if (ret) + return -1; + + ret = -1; + + for (i = 0; i < buf.gl_pathc; ++i) { + FILE *fp; + + snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]); + fp = fopen(path, "r"); + + if (!fp) + continue; + + while (fgets(path, sizeof(path), fp) != NULL) { + if (strstr(path, unique_field)) { + fclose(fp); + /* strscpy is not available, use snprintf */ + snprintf(out_dir, dir_len, "%s", buf.gl_pathv[i]); + ret = 0; + goto out; + } + } + + fclose(fp); + } +out: + globfree(); + + return ret; +} static bool event_exists(void) { @@ -74,6 +132,39 @@ static int event_delete(void) return ret; } +static int reg_enable_multi(void *enable, int size, int bit, int flags, + char *args) +{ + struct user_reg reg = {0}; + char full_args[512] = {0}; + int fd = open(data_file, O_RDWR); + int len; + int ret; + + if (fd < 0) + return -1; + + len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args); + + if (len > sizeof(full_args)) { + ret = -E2BIG; + goto out; + } + + reg.size = sizeof(reg); + reg.name_args = (__u64)full_args; + reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags; + reg.enable_bit = bit; + reg.enable_addr = (__u64)enable; + reg.enable_size = size; + + ret = ioctl(fd, DIAG_IOCSREG, ); +out: + close(fd); + + return ret; +} + static int reg_enable_flags(void *enable, int size, int bit, int flags) { struct user_reg reg = {0}; @@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) { ASSERT_NE(0, reg_enable(>check, 128, 0)); } +TEST_F(user, multi_format) { + char first_dir[256]; + char second_dir[256]; + struct stat buf; + + /* Multiple formats for the same name should work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0, + 0, "u32 multi_first")); + + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1, + 0, "u64 multi_second")); + + /* Same name with same format should also work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2, + 0, "u64 multi_second")); + + ASSERT_EQ(0, find_multi_event_dir("multi_first", + first_dir, sizeof(first_dir))); + + ASSERT_EQ(0, find_multi_event_dir("multi_second", + second_dir, sizeof(second_dir))); + + /* Should not be found in the same dir */ + ASSERT_NE(0, strcmp(first_dir, second_dir)); + + /* First dir should still exist */ + ASSERT_EQ(0, stat(first_dir, )); + + /* Disabling first register sho
[PATCH v4 0/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. This prevents single-format names from conflicting with multi-format events if they end with the same suffix as the multi-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}.{unique_id}. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test.0 test.1 Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Changes in v4: Use kstrdup() in user_event_group_system_multi_name() vs kmalloc() and snprintf(). Use kasprintf() in user_event_set_tp_name() vs kzalloc() and snprintf(). Grammar fixes in change logs. Changes in v3: Use hash_for_each_possible_safe() in destroy_user_event() to prevent use after free (caught by kernel test robot ). Changes in v2: Tracepoint names changed from "name:[id]" to "name.id". Feedback was the : could conflict with system name formats. []'s are also special characters for bash. Updated self-test and docs to reflect the new suffix format. Updated docs to include a regex example to help guide recording programs find the correct event in ambiguous cases. Beau Belgrave (4): tracing/user_events: Prepare find/delete for same name events tracing/user_events: Introduce multi-format events selftests/user_events: Test multi-format events tracing/user_events: Document multi-format flag Documentation/trace/user_events.rst | 27 ++- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 209 +- .../testing/selftests/user_events/abi_test.c | 134 +++ 4 files changed, 314 insertions(+), 62 deletions(-) base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a -- 2.34.1
Re: [PATCH v3 1/4] tracing/user_events: Prepare find/delete for same name events
On Wed, Feb 21, 2024 at 10:17:21AM -0500, Steven Rostedt wrote: > On Wed, 14 Feb 2024 17:50:43 + > Beau Belgrave wrote: > > So the patches look good, but since I gave you some updates, I'm now going > to go though "nits". Like grammar and such ;-) > Sure thing. > > The current code for finding and deleting events assumes that there will > > never be cases when user_events are registered with the same name, but > > different formats. In the future this scenario will exist to ensure > > > user programs can be updated or modify their events and run different > > versions of their programs side-by-side without being blocked. > > Can you change the last sentence above. I read it three times and it's > still awkward to understand it. Particularly, the "user programs can be > updated or modify their events". That just doesn't want to compute. > Yeah, I'll clean this up. > > > > This change does not yet allow for multi-format events. If user_events > > are registered with the same name but different arguments the programs > > see the same return values as before. This change simply makes it > > possible to easily accomodate for this in future changes. > > I think you can drop the "in future changes" part. > Agreed. > > > > Update find_user_event() to take in argument parameters and register > > flags to accomodate future multi-format event scenarios. Have find > > validate argument matching and return error pointers to cover address > > in use cases, or allocation errors. Update callers to handle error > > "to cover address in use cases" ? Yeah, if the ABI is using a single-format event and it's already in use, we return -EADDRINUSE. It does not happen in multi-format event cases, since that is allowed. I'll try to clarify this a bit. > > > pointer logic. > > > > Move delete_user_event() to use hash walking directly now that find has > > changed. Delete all events found that match the register name, stop > > "now that find has changed" ? You mean the "find function"? > Yeah, I'll just use the function name here, find_user_event(). > > if an error occurs and report back to the user. > > > > Update user_fields_match() to cover list_empty() scenarios instead of > > each callsite doing it now that find_user_event() uses it directly. > > The above is a bit of a run-on sentence. > I'll clean it up a bit. Thanks, -Beau > -- Steve > > > > > Signed-off-by: Beau Belgrave > > ---
Re: [PATCH v3 2/4] tracing/user_events: Introduce multi-format events
On Wed, Feb 21, 2024 at 10:08:33AM -0500, Steven Rostedt wrote: > On Wed, 14 Feb 2024 17:50:44 + > Beau Belgrave wrote: > > > +static char *user_event_group_system_multi_name(void) > > +{ > > + char *system_name; > > + int len = sizeof(USER_EVENTS_MULTI_SYSTEM) + 1; > > FYI, the sizeof() will include the "\0" so no need for "+ 1", but I don't > think this matters as for what I mention below. > > > + > > + system_name = kmalloc(len, GFP_KERNEL); > > + > > + if (!system_name) > > + return NULL; > > + > > + snprintf(system_name, len, "%s", USER_EVENTS_MULTI_SYSTEM); > > + > > + return system_name; > > Hmm, the above looks like an open coded version of: > > system_name = kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL); > That's much cleaner, I'll move to that. > > > +} > > + > > static struct user_event_group *current_user_event_group(void) > > { > > return init_group; > > @@ -367,6 +390,11 @@ static struct user_event_group > > *user_event_group_create(void) > > if (!group->system_name) > > goto error; > > > > + group->system_multi_name = user_event_group_system_multi_name(); > > + > > + if (!group->system_multi_name) > > + goto error; > > + > > mutex_init(>reg_mutex); > > hash_init(group->register_table); > > > > @@ -1482,6 +1510,11 @@ static int destroy_user_event(struct user_event > > *user) > > hash_del(>node); > > > > user_event_destroy_validators(user); > > + > > + /* If we have different names, both must be freed */ > > + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) > > + kfree(EVENT_TP_NAME(user)); > > + > > kfree(user->call.print_fmt); > > kfree(EVENT_NAME(user)); > > kfree(user); > > @@ -1504,12 +1537,24 @@ static struct user_event *find_user_event(struct > > user_event_group *group, > > *outkey = key; > > > > hash_for_each_possible(group->register_table, user, node, key) { > > + /* > > +* Single-format events shouldn't return multi-format > > +* events. Callers expect the underlying tracepoint to match > > +* the name exactly in these cases. Only check like-formats. > > +*/ > > + if (EVENT_MULTI_FORMAT(flags) != > > EVENT_MULTI_FORMAT(user->reg_flags)) > > + continue; > > + > > if (strcmp(EVENT_NAME(user), name)) > > continue; > > > > if (user_fields_match(user, argc, argv)) > > return user_event_get(user); > > > > + /* Scan others if this is a multi-format event */ > > + if (EVENT_MULTI_FORMAT(flags)) > > + continue; > > + > > return ERR_PTR(-EADDRINUSE); > > } > > > > @@ -1889,8 +1934,12 @@ static bool user_event_match(const char *system, > > const char *event, > > struct user_event *user = container_of(ev, struct user_event, devent); > > bool match; > > > > - match = strcmp(EVENT_NAME(user), event) == 0 && > > - (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); > > + match = strcmp(EVENT_NAME(user), event) == 0; > > + > > + if (match && system) { > > + match = strcmp(system, user->group->system_name) == 0 || > > + strcmp(system, user->group->system_multi_name) == 0; > > + } > > > > if (match) > > match = user_fields_match(user, argc, argv); > > @@ -1923,6 +1972,39 @@ static int user_event_trace_register(struct > > user_event *user) > > return ret; > > } > > > > +static int user_event_set_tp_name(struct user_event *user) > > +{ > > + lockdep_assert_held(>group->reg_mutex); > > + > > + if (EVENT_MULTI_FORMAT(user->reg_flags)) { > > + char *multi_name; > > + int len; > > + > > + len = snprintf(NULL, 0, "%s.%llx", user->reg_name, > > + user->group->multi_id) + 1; > > + > > + multi_name = kzalloc(len, GFP_KERNEL_ACCOUNT); > > + > > + if (!multi_name) > > + return -ENOMEM; > > + > > + snprintf(multi_name, len, "%s.%llx", user->reg_name, > > +user->group->multi_id); > > I believe the above can be replaced with: > > multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx", > user->reg_name, > user->group->multi_id); > if (!multi_name) > return -ENOMEM; > Great, I'll move to that as well and validate. Thanks, -Beau > -- Steve > > > + > > + user->call.name = multi_name; > > + user->tracepoint.name = multi_name; > > + > > + /* Inc to ensure unique multi-event name next time */ > > + user->group->multi_id++; > > + } else { > > + /* Non Multi-format uses register name */ > > + user->call.name = user->reg_name; > > + user->tracepoint.name = user->reg_name; > > + } > > + > > + return 0; > > +}
Re: [PATCH v3 2/4] tracing/user_events: Introduce multi-format events
On Wed, Feb 21, 2024 at 10:21:04AM -0500, Steven Rostedt wrote: > On Wed, 14 Feb 2024 17:50:44 + > Beau Belgrave wrote: > > > Currently user_events supports 1 event with the same name and must have > > the exact same format when referenced by multiple programs. This opens > > an opportunity for malicous or poorly thought through programs to > > create events that others use with different formats. Another scenario > > is user programs wishing to use the same event name but add more fields > > later when the software updates. Various versions of a program may be > > running side-by-side, which is prevented by the current single format > > requirement. > > > > Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates > > the user program wishes to use the same user_event name, but may have > > several different formats of the event in the future. When this flag is > > "of the event in the future." Does it have to be in the future? Is there > use case where an application might legitimately want the same event name > with different formats? > You're right, our use cases are mostly around future facing/compat. There are valid cases where you just want several different formats with the same name. I'll drop the "in the future", so it'll just be "several different formats". Thanks, -Beau > -- Steve > > > used, create the underlying tracepoint backing the user_event with a > > unique name per-version of the format. It's important that existing ABI > > users do not get this logic automatically, even if one of the multi > > format events matches the format. This ensures existing programs that > > create events and assume the tracepoint name will match exactly continue > > to work as expected. Add logic to only check multi-format events with > > other multi-format events and single-format events to only check > > single-format events during find. > >
[PATCH v3 4/4] tracing/user_events: Document multi-format flag
User programs can now ask user_events to handle the synchronization of multiple different formats for an event with the same name via the new USER_EVENT_REG_MULTI_FORMAT flag. Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended purpose and caveats of using it. Explain how deletion works in these cases and how to use /sys/kernel/tracing/dynamic_events for per-version deletion. Signed-off-by: Beau Belgrave --- Documentation/trace/user_events.rst | 27 ++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index d8f12442aaa6..1d5a7626e6a6 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -92,6 +92,24 @@ The following flags are currently supported. process closes or unregisters the event. Requires CAP_PERFMON otherwise -EPERM is returned. ++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This + allows programs to prevent themselves from being blocked when their event + format changes and they wish to use the same name. When this flag is used the + tracepoint name will be in the new format of "name.unique_id" vs the older + format of "name". A tracepoint will be created for each unique pair of name + and format. This means if several processes use the same name and format, + they will use the same tracepoint. If yet another process uses the same name, + but a different format than the other processes, it will use a different + tracepoint with a new unique id. Recording programs need to scan tracefs for + the various different formats of the event name they are interested in + recording. The system name of the tracepoint will also use "user_events_multi" + instead of "user_events". This prevents single-format event names conflicting + with any multi-format event names within tracefs. The unique_id is output as + a hex string. Recording programs should ensure the tracepoint name starts with + the event name they registered and has a suffix that starts with . and only + has hex characters. For example to find all versions of the event "test" you + can use the regex "^test\.[0-9a-fA-F]+$". + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -106,6 +124,9 @@ or perf record -e user_events:[name] when attaching/recording. **NOTE:** The event subsystem name by default is "user_events". Callers should not assume it will always be "user_events". Operators reserve the right in the future to change the subsystem name per-process to accommodate event isolation. +In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name +will have a unique id appended to it and the system name will be +"user_events_multi" as described above. Command Format ^^ @@ -156,7 +177,11 @@ to request deletes than the one used for registration due to this. to the event. If programs do not want auto-delete, they must use the USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an -event that persists requires CAP_PERFMON, otherwise -EPERM is returned. +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When +there are multiple formats of the same event name, all events with the same +name will be attempted to be deleted. If only a specific version is wanted to +be deleted then the /sys/kernel/tracing/dynamic_events file should be used for +that specific format of the event. Unregistering - -- 2.34.1
[PATCH v3 1/4] tracing/user_events: Prepare find/delete for same name events
The current code for finding and deleting events assumes that there will never be cases when user_events are registered with the same name, but different formats. In the future this scenario will exist to ensure user programs can be updated or modify their events and run different versions of their programs side-by-side without being blocked. This change does not yet allow for multi-format events. If user_events are registered with the same name but different arguments the programs see the same return values as before. This change simply makes it possible to easily accomodate for this in future changes. Update find_user_event() to take in argument parameters and register flags to accomodate future multi-format event scenarios. Have find validate argument matching and return error pointers to cover address in use cases, or allocation errors. Update callers to handle error pointer logic. Move delete_user_event() to use hash walking directly now that find has changed. Delete all events found that match the register name, stop if an error occurs and report back to the user. Update user_fields_match() to cover list_empty() scenarios instead of each callsite doing it now that find_user_event() uses it directly. Signed-off-by: Beau Belgrave --- kernel/trace/trace_events_user.c | 107 +-- 1 file changed, 59 insertions(+), 48 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9365ce407426..dda58681247e 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = >fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, )) return false; @@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const char *event, match = strcmp(EVENT_NAME(user), event) == 0 && (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); - if (match && argc > 0) + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(>fields); return match; } @@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { - int ret; - u32 key; struct user_event *user; + char **argv = NULL; int argc = 0; - char **argv; + int ret; + u32 key; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = argv_split(GFP_KERNEL, args, ); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(_mutex); - user = find_user_event(group, name, ); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, ); mutex_unlock(_mutex); - if (user) { - if (args) { -
[PATCH v3 2/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event in the future. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. This prevents single-format names from conflicting with multi-format events if they end with the same suffix as the multi-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}.{unique_id}. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test.0 test.1 Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Signed-off-by: Beau Belgrave --- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 118 +++ 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index f74f3aedd49c..a03de03dccbc 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -12,6 +12,7 @@ #include #define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_MULTI_SYSTEM "user_events_multi" #define USER_EVENTS_PREFIX "u:" /* Create dynamic location entry within a 32-bit value */ @@ -22,8 +23,11 @@ enum user_reg_flag { /* Event will not delete upon last reference closing */ USER_EVENT_REG_PERSIST = 1U << 0, + /* Event will be allowed to have multiple formats */ + USER_EVENT_REG_MULTI_FORMAT = 1U << 1, + /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, + USER_EVENT_REG_MAX = 1U << 2, }; /* diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index dda58681247e..53980982a575 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. */ struct user_event_group { - char*system_name; - struct hlist_node node; - struct mutex reg_mutex; + char*system_name; + char*system_multi_name; + struct hlist_node node; + struct mutexreg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that mov
[PATCH v3 3/4] selftests/user_events: Test multi-format events
User_events now has multi-format events which allow for the same register name, but with different formats. When this occurs, different tracepoints are created with unique names. Add a new test that ensures the same name can be used for two different formats. Ensure they are isolated from each other and that name and arg matching still works if yet another register comes in with the same format as one of the two. Signed-off-by: Beau Belgrave --- .../testing/selftests/user_events/abi_test.c | 134 ++ 1 file changed, 134 insertions(+) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index cef1ff1af223..7288a05136ba 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include "../kselftest_harness.h" @@ -23,6 +25,62 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; +const char *multi_dir_glob = "/sys/kernel/tracing/events/user_events_multi/__abi_event.*"; + +static int wait_for_delete(char *dir) +{ + struct stat buf; + int i; + + for (i = 0; i < 1; ++i) { + if (stat(dir, ) == -1 && errno == ENOENT) + return 0; + + usleep(1000); + } + + return -1; +} + +static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len) +{ + char path[256]; + glob_t buf; + int i, ret; + + ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, ); + + if (ret) + return -1; + + ret = -1; + + for (i = 0; i < buf.gl_pathc; ++i) { + FILE *fp; + + snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]); + fp = fopen(path, "r"); + + if (!fp) + continue; + + while (fgets(path, sizeof(path), fp) != NULL) { + if (strstr(path, unique_field)) { + fclose(fp); + /* strscpy is not available, use snprintf */ + snprintf(out_dir, dir_len, "%s", buf.gl_pathv[i]); + ret = 0; + goto out; + } + } + + fclose(fp); + } +out: + globfree(); + + return ret; +} static bool event_exists(void) { @@ -74,6 +132,39 @@ static int event_delete(void) return ret; } +static int reg_enable_multi(void *enable, int size, int bit, int flags, + char *args) +{ + struct user_reg reg = {0}; + char full_args[512] = {0}; + int fd = open(data_file, O_RDWR); + int len; + int ret; + + if (fd < 0) + return -1; + + len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args); + + if (len > sizeof(full_args)) { + ret = -E2BIG; + goto out; + } + + reg.size = sizeof(reg); + reg.name_args = (__u64)full_args; + reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags; + reg.enable_bit = bit; + reg.enable_addr = (__u64)enable; + reg.enable_size = size; + + ret = ioctl(fd, DIAG_IOCSREG, ); +out: + close(fd); + + return ret; +} + static int reg_enable_flags(void *enable, int size, int bit, int flags) { struct user_reg reg = {0}; @@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) { ASSERT_NE(0, reg_enable(>check, 128, 0)); } +TEST_F(user, multi_format) { + char first_dir[256]; + char second_dir[256]; + struct stat buf; + + /* Multiple formats for the same name should work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0, + 0, "u32 multi_first")); + + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1, + 0, "u64 multi_second")); + + /* Same name with same format should also work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2, + 0, "u64 multi_second")); + + ASSERT_EQ(0, find_multi_event_dir("multi_first", + first_dir, sizeof(first_dir))); + + ASSERT_EQ(0, find_multi_event_dir("multi_second", + second_dir, sizeof(second_dir))); + + /* Should not be found in the same dir */ + ASSERT_NE(0, strcmp(first_dir, second_dir)); + + /* First dir should still exist */ + ASSERT_EQ(0, stat(first_dir, )); + + /* Disabling first register sho
[PATCH v3 0/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event in the future. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. This prevents single-format names from conflicting with multi-format events if they end with the same suffix as the multi-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}.{unique_id}. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test.0 test.1 Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Changes in v3: Use hash_for_each_possible_safe() in destroy_user_event() to prevent use after free (caught by kernel test robot ). Changes in v2: Tracepoint names changed from "name:[id]" to "name.id". Feedback was the : could conflict with system name formats. []'s are also special characters for bash. Updated self-test and docs to reflect the new suffix format. Updated docs to include a regex example to help guide recording programs find the correct event in ambiguous cases. Beau Belgrave (4): tracing/user_events: Prepare find/delete for same name events tracing/user_events: Introduce multi-format events selftests/user_events: Test multi-format events tracing/user_events: Document multi-format flag Documentation/trace/user_events.rst | 27 ++- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 225 +- .../testing/selftests/user_events/abi_test.c | 134 +++ 4 files changed, 330 insertions(+), 62 deletions(-) base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a -- 2.34.1
[PATCH v2 3/4] selftests/user_events: Test multi-format events
User_events now has multi-format events which allow for the same register name, but with different formats. When this occurs, different tracepoints are created with unique names. Add a new test that ensures the same name can be used for two different formats. Ensure they are isolated from each other and that name and arg matching still works if yet another register comes in with the same format as one of the two. Signed-off-by: Beau Belgrave --- .../testing/selftests/user_events/abi_test.c | 134 ++ 1 file changed, 134 insertions(+) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index cef1ff1af223..7288a05136ba 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include "../kselftest_harness.h" @@ -23,6 +25,62 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; +const char *multi_dir_glob = "/sys/kernel/tracing/events/user_events_multi/__abi_event.*"; + +static int wait_for_delete(char *dir) +{ + struct stat buf; + int i; + + for (i = 0; i < 1; ++i) { + if (stat(dir, ) == -1 && errno == ENOENT) + return 0; + + usleep(1000); + } + + return -1; +} + +static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len) +{ + char path[256]; + glob_t buf; + int i, ret; + + ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, ); + + if (ret) + return -1; + + ret = -1; + + for (i = 0; i < buf.gl_pathc; ++i) { + FILE *fp; + + snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]); + fp = fopen(path, "r"); + + if (!fp) + continue; + + while (fgets(path, sizeof(path), fp) != NULL) { + if (strstr(path, unique_field)) { + fclose(fp); + /* strscpy is not available, use snprintf */ + snprintf(out_dir, dir_len, "%s", buf.gl_pathv[i]); + ret = 0; + goto out; + } + } + + fclose(fp); + } +out: + globfree(); + + return ret; +} static bool event_exists(void) { @@ -74,6 +132,39 @@ static int event_delete(void) return ret; } +static int reg_enable_multi(void *enable, int size, int bit, int flags, + char *args) +{ + struct user_reg reg = {0}; + char full_args[512] = {0}; + int fd = open(data_file, O_RDWR); + int len; + int ret; + + if (fd < 0) + return -1; + + len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args); + + if (len > sizeof(full_args)) { + ret = -E2BIG; + goto out; + } + + reg.size = sizeof(reg); + reg.name_args = (__u64)full_args; + reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags; + reg.enable_bit = bit; + reg.enable_addr = (__u64)enable; + reg.enable_size = size; + + ret = ioctl(fd, DIAG_IOCSREG, ); +out: + close(fd); + + return ret; +} + static int reg_enable_flags(void *enable, int size, int bit, int flags) { struct user_reg reg = {0}; @@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) { ASSERT_NE(0, reg_enable(>check, 128, 0)); } +TEST_F(user, multi_format) { + char first_dir[256]; + char second_dir[256]; + struct stat buf; + + /* Multiple formats for the same name should work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0, + 0, "u32 multi_first")); + + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1, + 0, "u64 multi_second")); + + /* Same name with same format should also work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2, + 0, "u64 multi_second")); + + ASSERT_EQ(0, find_multi_event_dir("multi_first", + first_dir, sizeof(first_dir))); + + ASSERT_EQ(0, find_multi_event_dir("multi_second", + second_dir, sizeof(second_dir))); + + /* Should not be found in the same dir */ + ASSERT_NE(0, strcmp(first_dir, second_dir)); + + /* First dir should still exist */ + ASSERT_EQ(0, stat(first_dir, )); + + /* Disabling first register sho
[PATCH v2 2/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event in the future. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. This prevents single-format names from conflicting with multi-format events if they end with the same suffix as the multi-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}.{unique_id}. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test.0 test.1 Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Signed-off-by: Beau Belgrave --- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 118 +++ 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index f74f3aedd49c..a03de03dccbc 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -12,6 +12,7 @@ #include #define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_MULTI_SYSTEM "user_events_multi" #define USER_EVENTS_PREFIX "u:" /* Create dynamic location entry within a 32-bit value */ @@ -22,8 +23,11 @@ enum user_reg_flag { /* Event will not delete upon last reference closing */ USER_EVENT_REG_PERSIST = 1U << 0, + /* Event will be allowed to have multiple formats */ + USER_EVENT_REG_MULTI_FORMAT = 1U << 1, + /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, + USER_EVENT_REG_MAX = 1U << 2, }; /* diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 0480579ba563..608750e4e6df 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. */ struct user_event_group { - char*system_name; - struct hlist_node node; - struct mutex reg_mutex; + char*system_name; + char*system_multi_name; + struct hlist_node node; + struct mutexreg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that mov
[PATCH v2 1/4] tracing/user_events: Prepare find/delete for same name events
The current code for finding and deleting events assumes that there will never be cases when user_events are registered with the same name, but different formats. In the future this scenario will exist to ensure user programs can be updated or modify their events and run different versions of their programs side-by-side without being blocked. This change does not yet allow for multi-format events. If user_events are registered with the same name but different arguments the programs see the same return values as before. This change simply makes it possible to easily accomodate for this in future changes. Update find_user_event() to take in argument parameters and register flags to accomodate future multi-format event scenarios. Have find validate argument matching and return error pointers to cover address in use cases, or allocation errors. Update callers to handle error pointer logic. Move delete_user_event() to use hash walking directly now that find has changed. Delete all events found that match the register name, stop if an error occurs and report back to the user. Update user_fields_match() to cover list_empty() scenarios instead of each callsite doing it now that find_user_event() uses it directly. Signed-off-by: Beau Belgrave --- kernel/trace/trace_events_user.c | 106 +-- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9365ce407426..0480579ba563 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = >fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, )) return false; @@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const char *event, match = strcmp(EVENT_NAME(user), event) == 0 && (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); - if (match && argc > 0) + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(>fields); return match; } @@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { - int ret; - u32 key; struct user_event *user; + char **argv = NULL; int argc = 0; - char **argv; + int ret; + u32 key; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = argv_split(GFP_KERNEL, args, ); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(_mutex); - user = find_user_event(group, name, ); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, ); mutex_unlock(_mutex); - if (user) { - if (args) { -
[PATCH v2 4/4] tracing/user_events: Document multi-format flag
User programs can now ask user_events to handle the synchronization of multiple different formats for an event with the same name via the new USER_EVENT_REG_MULTI_FORMAT flag. Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended purpose and caveats of using it. Explain how deletion works in these cases and how to use /sys/kernel/tracing/dynamic_events for per-version deletion. Signed-off-by: Beau Belgrave --- Documentation/trace/user_events.rst | 27 ++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index d8f12442aaa6..1d5a7626e6a6 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -92,6 +92,24 @@ The following flags are currently supported. process closes or unregisters the event. Requires CAP_PERFMON otherwise -EPERM is returned. ++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This + allows programs to prevent themselves from being blocked when their event + format changes and they wish to use the same name. When this flag is used the + tracepoint name will be in the new format of "name.unique_id" vs the older + format of "name". A tracepoint will be created for each unique pair of name + and format. This means if several processes use the same name and format, + they will use the same tracepoint. If yet another process uses the same name, + but a different format than the other processes, it will use a different + tracepoint with a new unique id. Recording programs need to scan tracefs for + the various different formats of the event name they are interested in + recording. The system name of the tracepoint will also use "user_events_multi" + instead of "user_events". This prevents single-format event names conflicting + with any multi-format event names within tracefs. The unique_id is output as + a hex string. Recording programs should ensure the tracepoint name starts with + the event name they registered and has a suffix that starts with . and only + has hex characters. For example to find all versions of the event "test" you + can use the regex "^test\.[0-9a-fA-F]+$". + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -106,6 +124,9 @@ or perf record -e user_events:[name] when attaching/recording. **NOTE:** The event subsystem name by default is "user_events". Callers should not assume it will always be "user_events". Operators reserve the right in the future to change the subsystem name per-process to accommodate event isolation. +In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name +will have a unique id appended to it and the system name will be +"user_events_multi" as described above. Command Format ^^ @@ -156,7 +177,11 @@ to request deletes than the one used for registration due to this. to the event. If programs do not want auto-delete, they must use the USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an -event that persists requires CAP_PERFMON, otherwise -EPERM is returned. +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When +there are multiple formats of the same event name, all events with the same +name will be attempted to be deleted. If only a specific version is wanted to +be deleted then the /sys/kernel/tracing/dynamic_events file should be used for +that specific format of the event. Unregistering - -- 2.34.1
[PATCH v2 0/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event in the future. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. This prevents single-format names from conflicting with multi-format events if they end with the same suffix as the multi-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}.{unique_id}. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test.0 test.1 Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Changes in v2: Tracepoint names changed from "name:[id]" to "name.id". Feedback was the : could conflict with system name formats. []'s are also special characters for bash. Updated self-test and docs to reflect the new suffix format. Updated docs to include a regex example to help guide recording programs find the correct event in ambiguous cases. Beau Belgrave (4): tracing/user_events: Prepare find/delete for same name events tracing/user_events: Introduce multi-format events selftests/user_events: Test multi-format events tracing/user_events: Document multi-format flag Documentation/trace/user_events.rst | 27 ++- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 224 +- .../testing/selftests/user_events/abi_test.c | 134 +++ 4 files changed, 329 insertions(+), 62 deletions(-) base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a -- 2.34.1
Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events
On Tue, Jan 30, 2024 at 01:52:30PM -0500, Steven Rostedt wrote: > On Tue, 30 Jan 2024 10:05:15 -0800 > Beau Belgrave wrote: > > > On Mon, Jan 29, 2024 at 09:24:07PM -0500, Steven Rostedt wrote: > > > On Mon, 29 Jan 2024 09:29:07 -0800 > > > Beau Belgrave wrote: > > > > > > > Thanks, yeah ideally we wouldn't use special characters. > > > > > > > > I'm not picky about this. However, I did want something that clearly > > > > allowed a glob pattern to find all versions of a given register name of > > > > user_events by user programs that record. The dot notation will pull in > > > > more than expected if dotted namespace style names are used. > > > > > > > > An example is "Asserts" and "Asserts.Verbose" from different programs. > > > > If we tried to find all versions of "Asserts" via glob of "Asserts.*" it > > > > will pull in "Asserts.Verbose.1" in addition to "Asserts.0". > > > > > > Do you prevent brackets in names? > > > > > > > No. However, since brackets have a start and end token that are distinct > > finding all versions of your event is trivial compared to a single dot. > > > > Imagine two events: > > Asserts > > Asserts[MyCoolIndex] > > > > Resolves to tracepoints of: > > Asserts:[0] > > Asserts[MyCoolIndex]:[1] > > > > Regardless of brackets in the names, a simple glob of Asserts:\[*\] only > > finds Asserts:[0]. This is because we have that end bracket in the glob > > and the full event name including the start bracket. > > > > If I register another "version" of Asserts, thne I'll have: > > Asserts:[0] > > Asserts[MyCoolIndex]:[1] > > Asserts:[2] > > > > The glob of Asserts:\[*\] will return both: > > Asserts:[0] > > Asserts:[2] > > But what if you had registered "Asserts:[MyCoolIndex]:[1]" > Good point, the above would still require a regex type pattern to not get pulled in. > Do you prevent colons? > No, nothing is prevented at this point. It seems we could either prevent certain characters to make it easier or define a good regex that we should document. I'm leaning toward just doing a simple suffix and documenting the regex well. > > > > At this point the program can either record all versions or scan further > > to find which version of Asserts is wanted. > > > > > > > > > > While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it > > > > doesn't work if the number is higher, like 128. If we ever decide to > > > > change the ID from an integer to say hex to save space, these globs > > > > would break. > > > > > > > > Is there some scheme that fits the C-variable name that addresses the > > > > above scenarios? Brackets gave me a simple glob that seemed to prevent a > > > > lot of this ("Asserts.\[*\]" in this case). > > > > > > Prevent a lot of what? I'm not sure what your example here is. > > > > > > > I'll try again :) > > > > We have 2 events registered via user_events: > > Asserts > > Asserts.Verbose > > > > Using dot notation these would result in tracepoints of: > > user_events_multi/Asserts.0 > > user_events_multi/Asserts.Verbose.1 > > > > Using bracket notation these would result in tracepoints of: > > user_events_multi/Asserts:[0] > > user_events_multi/Asserts.Verbose:[1] > > > > A recording program only wants to enable the Asserts tracepoint. It does > > not want to record the Asserts.Verbose tracepoint. > > > > The program must find the right tracepoint by scanning tracefs under the > > user_events_multi system. > > > > A single dot suffix does not allow a simple glob to be used. The glob > > Asserts.* will return both Asserts.0 and Asserts.Verbose.1. > > > > A simple glob of Asserts:\[*\] will only find Asserts:[0], it will not > > find Asserts.Verbose:[1]. > > > > We could just use brackets and not have the colon (Asserts[0] in this > > case). But brackets are still special for bash. > > Are these shell scripts or programs. I use regex in programs all the time. > And if you have shell scripts, use awk or something. > They could be both. In our case, it is a program. > Unless you prevent something from being added, I don't see the protection. > Yeah, it just makes it way less likely. Given that, I'm starting to lean toward just docu
Re: [PATCH 0/4] tracing/user_events: Introduce multi-format events
On Tue, Jan 30, 2024 at 11:09:33AM +0900, Masami Hiramatsu wrote: > Hi Beau, > > On Tue, 23 Jan 2024 22:08:40 +0000 > Beau Belgrave wrote: > > > Currently user_events supports 1 event with the same name and must have > > the exact same format when referenced by multiple programs. This opens > > an opportunity for malicous or poorly thought through programs to > > create events that others use with different formats. Another scenario > > is user programs wishing to use the same event name but add more fields > > later when the software updates. Various versions of a program may be > > running side-by-side, which is prevented by the current single format > > requirement. > > > > Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates > > the user program wishes to use the same user_event name, but may have > > several different formats of the event in the future. When this flag is > > used, create the underlying tracepoint backing the user_event with a > > unique name per-version of the format. It's important that existing ABI > > users do not get this logic automatically, even if one of the multi > > format events matches the format. This ensures existing programs that > > create events and assume the tracepoint name will match exactly continue > > to work as expected. Add logic to only check multi-format events with > > other multi-format events and single-format events to only check > > single-format events during find. > > Thanks for this work! This will allow many instance to use the same > user-events at the same time. > > BTW, can we force this flag set by default? My concern is if any user > program use this user-event interface in the container (maybe it is > possible if we bind-mount it). In this case, the user program can > detect the other program is using the event if this flag is not set. > Moreover, if there is a malicious program running in the container, > it can prevent using the event name from other programs even if it > is isolated by the name-space. > The multi-format use a different system name (user_events_multi). So you cannot use the single-format flag to detect multi-format names, etc. You can only use it to find other single-format names like you could always do. Likewise, you cannot use the single-event flag to block or prevent multi-format events from being created. Changing this behavior to default would break all of our environments. So I'm pretty sure it would break others. The current environment expects tracepoints to show up as their registered name under the user_events system name. If this changed out from under us on a specific kernel version, that would be bad. I'd like eventually to have a tracer namespace concept for containers. Then we would have a user_event_group per tracer namespace. Then all user_events within the container have a unique system name which fully isolates them. However, even with that isolation, we still need a way to allow programs in the same container to have different versions of the same event name. Multi-format events fixes this problem. I think isolation should be dealt with via true namespace isolation at the tracing level. > Steve suggested that if a user program which is running in a namespace > uses user-event without this flag, we can reject that by default. > > What would you think about? > This would break all of our environments. It would make previously compiled programs using the existing ABI from working as expected. I'd much prefer that level of isolation to happen at the namespace level and why user_events as plumbing for different groups to achieve this. It's also why the ABI does not allow programs to state a system name. I'm trying to reserve the system name for the group/tracer/namespace isolation work. Thanks, -Beau > Thank you, > > > > > > Add a register_name (reg_name) to the user_event struct which allows for > > split naming of events. We now have the name that was used to register > > within user_events as well as the unique name for the tracepoint. Upon > > registering events ensure matches based on first the reg_name, followed > > by the fields and format of the event. This allows for multiple events > > with the same registered name to have different formats. The underlying > > tracepoint will have a unique name in the format of {reg_name}:[unique_id]. > > The unique_id is the time, in nanoseconds, of the event creation converted > > to hex. Since this is done under the register mutex, it is extremely > > unlikely for these IDs to ever match. It's also very unlikely a malicious > > program could consistently guess what the name would be and attempt to > > squat on it via the single format ABI. > > &g
Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events
On Tue, Jan 30, 2024 at 11:12:22PM +0900, Masami Hiramatsu wrote: > On Mon, 29 Jan 2024 09:29:07 -0800 > Beau Belgrave wrote: > > > On Fri, Jan 26, 2024 at 03:04:45PM -0500, Steven Rostedt wrote: > > > On Fri, 26 Jan 2024 11:10:07 -0800 > > > Beau Belgrave wrote: > > > > > > > > OK, so the each different event has suffixed name. But this will > > > > > introduce non C-variable name. > > > > > > > > > > Steve, do you think your library can handle these symbols? It will > > > > > be something like "event:[1]" as the event name. > > > > > Personally I like "event.1" style. (of course we need to ensure the > > > > > user given event name is NOT including such suffix numbers) > > > > > > > > > > > > > Just to clarify around events including a suffix number. This is why > > > > multi-events use "user_events_multi" system name and the single-events > > > > using just "user_events". > > > > > > > > Even if a user program did include a suffix, the suffix would still get > > > > appended. An example is "test" vs "test:[0]" using multi-format would > > > > result in two tracepoints ("test:[0]" and "test:[0]:[1]" respectively > > > > (assuming these are the first multi-events on the system). > > > > > > > > I'm with you, we really don't want any spoofing or squatting possible. > > > > By using different system names and always appending the suffix I > > > > believe covers this. > > > > > > > > Looking forward to hearing Steven's thoughts on this as well. > > > > > > I'm leaning towards Masami's suggestion to use dots, as that won't > > > conflict > > > with special characters from bash, as '[' and ']' do. > > > > > > > Thanks, yeah ideally we wouldn't use special characters. > > > > I'm not picky about this. However, I did want something that clearly > > allowed a glob pattern to find all versions of a given register name of > > user_events by user programs that record. The dot notation will pull in > > more than expected if dotted namespace style names are used. > > > > An example is "Asserts" and "Asserts.Verbose" from different programs. > > If we tried to find all versions of "Asserts" via glob of "Asserts.*" it > > will pull in "Asserts.Verbose.1" in addition to "Asserts.0". > > If we use dot for the suffix number, we can prohibit user to use it > for their name. They still can use '_' (or change the group name?) We could, however, we have user_event integration in OpenTelemetry and I'm unsure if we should really try to restrict names. We'll also at some point have libside integration, which might not have the same restrictions on the user-tracer side as the kernel-tracer side. I'm trying to restrict the user_event group name from changing outside of an eventual tracer namespace. I'd like for each container to inherit a tracer namespace long-term which decides what the actual group name will be instead of users self-selecting names to prevent squatting or spoofing of events. > I just concerned that the name can be parsed by existing tools. Since > ':' is used as a separator for group and event name in some case (e.g. > tracefs "set_event" is using, so trace-cmd and perf is using it.) > Good point. What about just event_name[unique_id]? IE: Drop the colon. Brackets are still special in bash, but it would prevent simple glob patterns from matching to incorrect tracepoints under user_events_multi. > > While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it > > doesn't work if the number is higher, like 128. If we ever decide to > > change the ID from an integer to say hex to save space, these globs > > would break. > > Hmm, why can't we use regexp? We can use regex, but we'll need to agree the suffix format. We won't be able to change it after that point. I'd prefer a base-16/Hex suffix either in brackets or a simple dot. > And if we limits the number of events up to 1000 for each same-name event > we can use fixed numbers, like Assets.[0-9][0-9][0-9] > I'm always wrong when I guess how many events programs will end up using. Folks always surprise me. I'd rather have a solution that scales to the max number of tracepoints allowed on the system (currently 16-bit max value). Thanks, -Beau > Thank you, > > > > > Is there some scheme that fits the C-variable name that addresses the > > above scenarios? Brackets gave me a simple glob that seemed to prevent a > > lot of this ("Asserts.\[*\]" in this case). > > > > Are we confident that we always want to represent the ID as a base-10 > > integer vs a base-16 integer? The suffix will be ABI to ensure recording > > programs can find their events easily. > > > > Thanks, > > -Beau > > > > > -- Steve > > > -- > Masami Hiramatsu (Google)
Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events
On Mon, Jan 29, 2024 at 09:24:07PM -0500, Steven Rostedt wrote: > On Mon, 29 Jan 2024 09:29:07 -0800 > Beau Belgrave wrote: > > > Thanks, yeah ideally we wouldn't use special characters. > > > > I'm not picky about this. However, I did want something that clearly > > allowed a glob pattern to find all versions of a given register name of > > user_events by user programs that record. The dot notation will pull in > > more than expected if dotted namespace style names are used. > > > > An example is "Asserts" and "Asserts.Verbose" from different programs. > > If we tried to find all versions of "Asserts" via glob of "Asserts.*" it > > will pull in "Asserts.Verbose.1" in addition to "Asserts.0". > > Do you prevent brackets in names? > No. However, since brackets have a start and end token that are distinct finding all versions of your event is trivial compared to a single dot. Imagine two events: Asserts Asserts[MyCoolIndex] Resolves to tracepoints of: Asserts:[0] Asserts[MyCoolIndex]:[1] Regardless of brackets in the names, a simple glob of Asserts:\[*\] only finds Asserts:[0]. This is because we have that end bracket in the glob and the full event name including the start bracket. If I register another "version" of Asserts, thne I'll have: Asserts:[0] Asserts[MyCoolIndex]:[1] Asserts:[2] The glob of Asserts:\[*\] will return both: Asserts:[0] Asserts:[2] At this point the program can either record all versions or scan further to find which version of Asserts is wanted. > > > > While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it > > doesn't work if the number is higher, like 128. If we ever decide to > > change the ID from an integer to say hex to save space, these globs > > would break. > > > > Is there some scheme that fits the C-variable name that addresses the > > above scenarios? Brackets gave me a simple glob that seemed to prevent a > > lot of this ("Asserts.\[*\]" in this case). > > Prevent a lot of what? I'm not sure what your example here is. > I'll try again :) We have 2 events registered via user_events: Asserts Asserts.Verbose Using dot notation these would result in tracepoints of: user_events_multi/Asserts.0 user_events_multi/Asserts.Verbose.1 Using bracket notation these would result in tracepoints of: user_events_multi/Asserts:[0] user_events_multi/Asserts.Verbose:[1] A recording program only wants to enable the Asserts tracepoint. It does not want to record the Asserts.Verbose tracepoint. The program must find the right tracepoint by scanning tracefs under the user_events_multi system. A single dot suffix does not allow a simple glob to be used. The glob Asserts.* will return both Asserts.0 and Asserts.Verbose.1. A simple glob of Asserts:\[*\] will only find Asserts:[0], it will not find Asserts.Verbose:[1]. We could just use brackets and not have the colon (Asserts[0] in this case). But brackets are still special for bash. > > > > Are we confident that we always want to represent the ID as a base-10 > > integer vs a base-16 integer? The suffix will be ABI to ensure recording > > programs can find their events easily. > > Is there a difference to what we choose? > If a simple glob of event_name:\[*\] cannot be used, then we must document what the suffix format is, so an appropriate regex can be created. If we start with base-10 then later move to base-16 we will break existing regex patterns on the recording side. I prefer, and have in this series, a base-16 output since it saves on the tracepoint name size. Either way we go, we need to define how recording programs should find the events they care about. So we must be very clear, IMHO, about the format of the tracepoint names in our documentation. I personally think recording programs are likely to get this wrong without proper guidance. Thanks, -Beau > -- Steve
Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events
On Fri, Jan 26, 2024 at 03:04:45PM -0500, Steven Rostedt wrote: > On Fri, 26 Jan 2024 11:10:07 -0800 > Beau Belgrave wrote: > > > > OK, so the each different event has suffixed name. But this will > > > introduce non C-variable name. > > > > > > Steve, do you think your library can handle these symbols? It will > > > be something like "event:[1]" as the event name. > > > Personally I like "event.1" style. (of course we need to ensure the > > > user given event name is NOT including such suffix numbers) > > > > > > > Just to clarify around events including a suffix number. This is why > > multi-events use "user_events_multi" system name and the single-events > > using just "user_events". > > > > Even if a user program did include a suffix, the suffix would still get > > appended. An example is "test" vs "test:[0]" using multi-format would > > result in two tracepoints ("test:[0]" and "test:[0]:[1]" respectively > > (assuming these are the first multi-events on the system). > > > > I'm with you, we really don't want any spoofing or squatting possible. > > By using different system names and always appending the suffix I > > believe covers this. > > > > Looking forward to hearing Steven's thoughts on this as well. > > I'm leaning towards Masami's suggestion to use dots, as that won't conflict > with special characters from bash, as '[' and ']' do. > Thanks, yeah ideally we wouldn't use special characters. I'm not picky about this. However, I did want something that clearly allowed a glob pattern to find all versions of a given register name of user_events by user programs that record. The dot notation will pull in more than expected if dotted namespace style names are used. An example is "Asserts" and "Asserts.Verbose" from different programs. If we tried to find all versions of "Asserts" via glob of "Asserts.*" it will pull in "Asserts.Verbose.1" in addition to "Asserts.0". While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it doesn't work if the number is higher, like 128. If we ever decide to change the ID from an integer to say hex to save space, these globs would break. Is there some scheme that fits the C-variable name that addresses the above scenarios? Brackets gave me a simple glob that seemed to prevent a lot of this ("Asserts.\[*\]" in this case). Are we confident that we always want to represent the ID as a base-10 integer vs a base-16 integer? The suffix will be ABI to ensure recording programs can find their events easily. Thanks, -Beau > -- Steve
Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events
On Sat, Jan 27, 2024 at 12:01:04AM +0900, Masami Hiramatsu wrote: > On Tue, 23 Jan 2024 22:08:42 + > Beau Belgrave wrote: > > > Add a register_name (reg_name) to the user_event struct which allows for > > split naming of events. We now have the name that was used to register > > within user_events as well as the unique name for the tracepoint. Upon > > registering events ensure matches based on first the reg_name, followed > > by the fields and format of the event. This allows for multiple events > > with the same registered name to have different formats. The underlying > > tracepoint will have a unique name in the format of {reg_name}:[unique_id]. > > > > For example, if both "test u32 value" and "test u64 value" are used with > > the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique > > tracepoints. The dynamic_events file would then show the following: > > u:test u64 count > > u:test u32 count > > > > The actual tracepoint names look like this: > > test:[d5874fdac44] > > test:[d5914662cd4] > > > > Both would be under the new user_events_multi system name to prevent the > > older ABI from being used to squat on multi-formatted events and block > > their use. > [...] > > @@ -1923,6 +1972,39 @@ static int user_event_trace_register(struct > > user_event *user) > > return ret; > > } > > > > +static int user_event_set_tp_name(struct user_event *user) > > +{ > > + lockdep_assert_held(>group->reg_mutex); > > + > > + if (EVENT_MULTI_FORMAT(user->reg_flags)) { > > + char *multi_name; > > + int len; > > + > > + len = snprintf(NULL, 0, "%s:[%llx]", user->reg_name, > > + user->group->multi_id) + 1; > > + > > + multi_name = kzalloc(len, GFP_KERNEL_ACCOUNT); > > + > > + if (!multi_name) > > + return -ENOMEM; > > + > > + snprintf(multi_name, len, "%s:[%llx]", user->reg_name, > > +user->group->multi_id); > > OK, so the each different event has suffixed name. But this will > introduce non C-variable name. > > Steve, do you think your library can handle these symbols? It will > be something like "event:[1]" as the event name. > Personally I like "event.1" style. (of course we need to ensure the > user given event name is NOT including such suffix numbers) > Just to clarify around events including a suffix number. This is why multi-events use "user_events_multi" system name and the single-events using just "user_events". Even if a user program did include a suffix, the suffix would still get appended. An example is "test" vs "test:[0]" using multi-format would result in two tracepoints ("test:[0]" and "test:[0]:[1]" respectively (assuming these are the first multi-events on the system). I'm with you, we really don't want any spoofing or squatting possible. By using different system names and always appending the suffix I believe covers this. Looking forward to hearing Steven's thoughts on this as well. Thanks, -Beau > Thank you. > > -- > Masami Hiramatsu (Google)
Re: [PATCH 0/4] tracing/user_events: Introduce multi-format events
It appears to put an outdated coversheet onto this series. Below is the updated coversheet that reflects changes made: Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event in the future. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}:[unique_id]. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test:[d5874fdac44] test:[d5914662cd4] Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Thanks, -Beau
Re: [PATCH 1/4] tracing/user_events: Prepare find/delete for same name events
On Thu, Jan 25, 2024 at 09:59:03AM +0900, Masami Hiramatsu wrote: > On Tue, 23 Jan 2024 22:08:41 + > Beau Belgrave wrote: > > > The current code for finding and deleting events assumes that there will > > never be cases when user_events are registered with the same name, but > > different formats. In the future this scenario will exist to ensure > > user programs can be updated or modify their events and run different > > versions of their programs side-by-side without being blocked. > > Ah, this is a very important point. Kernel always has only one instance > but user program doesn't. Thus it can define the same event name. > For the similar problem, uprobe event assumes that the user (here > admin) will define different group name to avoid it. But for the user > event, it is embedded, hmm. > Yes, the series will handle if multi-processes use the same name, we will find a matching version of that name within the user_event group. If there isn't one, a new one is created. Each is backed by an independent tracepoint which does match up with how uprobe does it. This actually got brought up in the tracefs meetings we've had and it seemed to get wide agreement on how to best handle this. > > > > This change does not yet allow for multi-format events. If user_events > > are registered with the same name but different arguments the programs > > see the same return values as before. This change simply makes it > > possible to easily accomodate for this in future changes. > > > > Update find_user_event() to take in argument parameters and register > > flags to accomodate future multi-format event scenarios. Have find > > validate argument matching and return error pointers to cover address > > in use cases, or allocation errors. Update callers to handle error > > pointer logic. > > Understand, that is similar to what probe events do. > > > > > Move delete_user_event() to use hash walking directly now that find has > > changed. Delete all events found that match the register name, stop > > if an error occurs and report back to the user. > > What happen if we run 2 different version of the applications and terminate > one of them? The event which is used by others will be kept? > Each unique version of a user_event has it's own ref-count. If one version is not-used, but another version is, only the not-used version will get deleted. The other version that is in use will return a -EBUSY when it gets to that version via enumeration. While we only have a single tracepoint per-version, we have several user_event structures in memory that have the same name, yet different formats. Each of which have their own lifetime, enablers and ref-counts to keep them isolated from each other. Thanks, -Beau > Thank you, > > > > > Update user_fields_match() to cover list_empty() scenarios instead of > > each callsite doing it now that find_user_event() uses it directly. > > > > Signed-off-by: Beau Belgrave > > --- > > kernel/trace/trace_events_user.c | 106 +-- > > 1 file changed, 58 insertions(+), 48 deletions(-) > > > > diff --git a/kernel/trace/trace_events_user.c > > b/kernel/trace/trace_events_user.c > > index 9365ce407426..0480579ba563 100644 > > --- a/kernel/trace/trace_events_user.c > > +++ b/kernel/trace/trace_events_user.c > > @@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct > > user_event_mm *mm); > > static struct user_event_mm *user_event_mm_get_all(struct user_event > > *user); > > static void user_event_mm_put(struct user_event_mm *mm); > > static int destroy_user_event(struct user_event *user); > > +static bool user_fields_match(struct user_event *user, int argc, > > + const char **argv); > > > > static u32 user_event_key(char *name) > > { > > @@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event > > *user) > > } > > > > static struct user_event *find_user_event(struct user_event_group *group, > > - char *name, u32 *outkey) > > + char *name, int argc, const char > > **argv, > > + u32 flags, u32 *outkey) > > { > > struct user_event *user; > > u32 key = user_event_key(name); > > > > *outkey = key; > > > > - hash_for_each_possible(group->register_table, user, node, key) > > - if (!strcmp(EVENT_NAME(user), name)) > > + hash_for_each_possible(group->register_table, user, node, key) { > > +
[PATCH 2/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event in the future. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Change system name of the multi-format event tracepoint to ensure that multi-format events are isolated completely from single-format events. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}:[unique_id]. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test:[d5874fdac44] test:[d5914662cd4] Both would be under the new user_events_multi system name to prevent the older ABI from being used to squat on multi-formatted events and block their use. Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Signed-off-by: Beau Belgrave --- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 118 +++ 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index f74f3aedd49c..a03de03dccbc 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -12,6 +12,7 @@ #include #define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_MULTI_SYSTEM "user_events_multi" #define USER_EVENTS_PREFIX "u:" /* Create dynamic location entry within a 32-bit value */ @@ -22,8 +23,11 @@ enum user_reg_flag { /* Event will not delete upon last reference closing */ USER_EVENT_REG_PERSIST = 1U << 0, + /* Event will be allowed to have multiple formats */ + USER_EVENT_REG_MULTI_FORMAT = 1U << 1, + /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, + USER_EVENT_REG_MAX = 1U << 2, }; /* diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 0480579ba563..f9c0781285b6 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. */ struct user_event_group { - char*system_name; - struct hlist_node node; - struct mutex reg_mutex; + char*system_name; + char*system_multi_name; + struct hlist_node node; + struct mutexreg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that moves forward within the group for multi-event names */ + u64 multi_id; }; /* Group for init
[PATCH 3/4] selftests/user_events: Test multi-format events
User_events now has multi-format events which allow for the same register name, but with different formats. When this occurs, different tracepoints are created with unique names. Add a new test that ensures the same name can be used for two different formats. Ensure they are isolated from each other and that name and arg matching still works if yet another register comes in with the same format as one of the two. Signed-off-by: Beau Belgrave --- .../testing/selftests/user_events/abi_test.c | 134 ++ 1 file changed, 134 insertions(+) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index cef1ff1af223..b3480fad65a5 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include "../kselftest_harness.h" @@ -23,6 +25,62 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; +const char *multi_dir_glob = "/sys/kernel/tracing/events/user_events_multi/__abi_event:*"; + +static int wait_for_delete(char *dir) +{ + struct stat buf; + int i; + + for (i = 0; i < 1; ++i) { + if (stat(dir, ) == -1 && errno == ENOENT) + return 0; + + usleep(1000); + } + + return -1; +} + +static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len) +{ + char path[256]; + glob_t buf; + int i, ret; + + ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, ); + + if (ret) + return -1; + + ret = -1; + + for (i = 0; i < buf.gl_pathc; ++i) { + FILE *fp; + + snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]); + fp = fopen(path, "r"); + + if (!fp) + continue; + + while (fgets(path, sizeof(path), fp) != NULL) { + if (strstr(path, unique_field)) { + fclose(fp); + /* strscpy is not available, use snprintf */ + snprintf(out_dir, dir_len, "%s", buf.gl_pathv[i]); + ret = 0; + goto out; + } + } + + fclose(fp); + } +out: + globfree(); + + return ret; +} static bool event_exists(void) { @@ -74,6 +132,39 @@ static int event_delete(void) return ret; } +static int reg_enable_multi(void *enable, int size, int bit, int flags, + char *args) +{ + struct user_reg reg = {0}; + char full_args[512] = {0}; + int fd = open(data_file, O_RDWR); + int len; + int ret; + + if (fd < 0) + return -1; + + len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args); + + if (len > sizeof(full_args)) { + ret = -E2BIG; + goto out; + } + + reg.size = sizeof(reg); + reg.name_args = (__u64)full_args; + reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags; + reg.enable_bit = bit; + reg.enable_addr = (__u64)enable; + reg.enable_size = size; + + ret = ioctl(fd, DIAG_IOCSREG, ); +out: + close(fd); + + return ret; +} + static int reg_enable_flags(void *enable, int size, int bit, int flags) { struct user_reg reg = {0}; @@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) { ASSERT_NE(0, reg_enable(>check, 128, 0)); } +TEST_F(user, multi_format) { + char first_dir[256]; + char second_dir[256]; + struct stat buf; + + /* Multiple formats for the same name should work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0, + 0, "u32 multi_first")); + + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1, + 0, "u64 multi_second")); + + /* Same name with same format should also work */ + ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2, + 0, "u64 multi_second")); + + ASSERT_EQ(0, find_multi_event_dir("multi_first", + first_dir, sizeof(first_dir))); + + ASSERT_EQ(0, find_multi_event_dir("multi_second", + second_dir, sizeof(second_dir))); + + /* Should not be found in the same dir */ + ASSERT_NE(0, strcmp(first_dir, second_dir)); + + /* First dir should still exist */ + ASSERT_EQ(0, stat(first_dir, )); + + /* Disabling first register sho
[PATCH 4/4] tracing/user_events: Document multi-format flag
User programs can now ask user_events to handle the synchronization of multiple different formats for an event with the same name via the new USER_EVENT_REG_MULTI_FORMAT flag. Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended purpose and caveats of using it. Explain how deletion works in these cases and how to use /sys/kernel/tracing/dynamic_events for per-version deletion. Signed-off-by: Beau Belgrave --- Documentation/trace/user_events.rst | 23 ++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index d8f12442aaa6..35a2524bc73c 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -92,6 +92,20 @@ The following flags are currently supported. process closes or unregisters the event. Requires CAP_PERFMON otherwise -EPERM is returned. ++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This + allows programs to prevent themselves from being blocked when their event + format changes and they wish to use the same name. When this flag is used the + tracepoint name will be in the new format of "name:[unique_id]" vs the older + format of "name". A tracepoint will be created for each unique pair of name + and format. This means if several processes use the same name and format, + they will use the same tracepoint. If yet another process uses the same name, + but a different format than the other processes, it will use a different + tracepoint with a new unique id. Recording programs need to scan tracefs for + the various different formats of the event name they are interested in + recording. The system name of the tracepoint will also use "user_events_multi" + instead of "user_events". This prevents single-format event names conflicting + with any multi-format event names within tracefs. + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -106,6 +120,9 @@ or perf record -e user_events:[name] when attaching/recording. **NOTE:** The event subsystem name by default is "user_events". Callers should not assume it will always be "user_events". Operators reserve the right in the future to change the subsystem name per-process to accommodate event isolation. +In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name +will have a unique id appended to it and the system name will be +"user_events_multi" as described above. Command Format ^^ @@ -156,7 +173,11 @@ to request deletes than the one used for registration due to this. to the event. If programs do not want auto-delete, they must use the USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an -event that persists requires CAP_PERFMON, otherwise -EPERM is returned. +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When +there are multiple formats of the same event name, all events with the same +name will be attempted to be deleted. If only a specific version is wanted to +be deleted then the /sys/kernel/tracing/dynamic_events file should be used for +that specific format of the event. Unregistering - -- 2.34.1
[PATCH 0/4] tracing/user_events: Introduce multi-format events
Currently user_events supports 1 event with the same name and must have the exact same format when referenced by multiple programs. This opens an opportunity for malicous or poorly thought through programs to create events that others use with different formats. Another scenario is user programs wishing to use the same event name but add more fields later when the software updates. Various versions of a program may be running side-by-side, which is prevented by the current single format requirement. Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates the user program wishes to use the same user_event name, but may have several different formats of the event in the future. When this flag is used, create the underlying tracepoint backing the user_event with a unique name per-version of the format. It's important that existing ABI users do not get this logic automatically, even if one of the multi format events matches the format. This ensures existing programs that create events and assume the tracepoint name will match exactly continue to work as expected. Add logic to only check multi-format events with other multi-format events and single-format events to only check single-format events during find. Add a register_name (reg_name) to the user_event struct which allows for split naming of events. We now have the name that was used to register within user_events as well as the unique name for the tracepoint. Upon registering events ensure matches based on first the reg_name, followed by the fields and format of the event. This allows for multiple events with the same registered name to have different formats. The underlying tracepoint will have a unique name in the format of {reg_name}:[unique_id]. The unique_id is the time, in nanoseconds, of the event creation converted to hex. Since this is done under the register mutex, it is extremely unlikely for these IDs to ever match. It's also very unlikely a malicious program could consistently guess what the name would be and attempt to squat on it via the single format ABI. For example, if both "test u32 value" and "test u64 value" are used with the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique tracepoints. The dynamic_events file would then show the following: u:test u64 count u:test u32 count The actual tracepoint names look like this: test:[d5874fdac44] test:[d5914662cd4] Deleting events via "!u:test u64 count" would only delete the first tracepoint that matched that format. When the delete ABI is used all events with the same name will be attempted to be deleted. If per-version deletion is required, user programs should either not use persistent events or delete them via dynamic_events. Beau Belgrave (4): tracing/user_events: Prepare find/delete for same name events tracing/user_events: Introduce multi-format events selftests/user_events: Test multi-format events tracing/user_events: Document multi-format flag Documentation/trace/user_events.rst | 23 +- include/uapi/linux/user_events.h | 6 +- kernel/trace/trace_events_user.c | 224 +- .../testing/selftests/user_events/abi_test.c | 134 +++ 4 files changed, 325 insertions(+), 62 deletions(-) base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a -- 2.34.1
[PATCH 1/4] tracing/user_events: Prepare find/delete for same name events
The current code for finding and deleting events assumes that there will never be cases when user_events are registered with the same name, but different formats. In the future this scenario will exist to ensure user programs can be updated or modify their events and run different versions of their programs side-by-side without being blocked. This change does not yet allow for multi-format events. If user_events are registered with the same name but different arguments the programs see the same return values as before. This change simply makes it possible to easily accomodate for this in future changes. Update find_user_event() to take in argument parameters and register flags to accomodate future multi-format event scenarios. Have find validate argument matching and return error pointers to cover address in use cases, or allocation errors. Update callers to handle error pointer logic. Move delete_user_event() to use hash walking directly now that find has changed. Delete all events found that match the register name, stop if an error occurs and report back to the user. Update user_fields_match() to cover list_empty() scenarios instead of each callsite doing it now that find_user_event() uses it directly. Signed-off-by: Beau Belgrave --- kernel/trace/trace_events_user.c | 106 +-- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9365ce407426..0480579ba563 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = >fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, )) return false; @@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const char *event, match = strcmp(EVENT_NAME(user), event) == 0 && (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); - if (match && argc > 0) + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(>fields); return match; } @@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { - int ret; - u32 key; struct user_event *user; + char **argv = NULL; int argc = 0; - char **argv; + int ret; + u32 key; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = argv_split(GFP_KERNEL, args, ); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(_mutex); - user = find_user_event(group, name, ); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, ); mutex_unlock(_mutex); - if (user) { - if (args) { -
Re: [PATCH] tracing user_events: Simplify user_event_parse_field() parsing
On Mon, Jan 08, 2024 at 01:37:23PM -0500, Steven Rostedt wrote: > From: "Steven Rostedt (Google)" > > Instead of having a bunch of if statements with: > >len = str_has_prefix(field, "__data_loc unsigned "); >if (len) >goto skip_next; > >len = str_has_prefix(field, "__data_loc "); >if (len) >goto skip_next; > >len = str_has_prefix(field, "__rel_loc unsigned "); >if (len) >goto skip_next; > >len = str_has_prefix(field, "__rel_loc "); >if (len) >goto skip_next; > > goto parse; > > skip_next: > > Consolidate it into a negative check and jump to parse if all the > str_has_prefix() calls fail. If one succeeds, it will just continue with > len equal to the proper string: > >if (!(len = str_has_prefix(field, "__data_loc unsigned ")) && >!(len = str_has_prefix(field, "__data_loc ")) && >!(len = str_has_prefix(field, "__rel_loc unsigned ")) && >!(len = str_has_prefix(field, "__rel_loc "))) { >goto parse; >} > > skip_next: > > Signed-off-by: Steven Rostedt (Google) > --- > kernel/trace/trace_events_user.c | 22 ++ > 1 file changed, 6 insertions(+), 16 deletions(-) > > diff --git a/kernel/trace/trace_events_user.c > b/kernel/trace/trace_events_user.c > index 9365ce407426..ce0c5f1ded48 100644 > --- a/kernel/trace/trace_events_user.c > +++ b/kernel/trace/trace_events_user.c > @@ -1175,23 +1175,13 @@ static int user_event_parse_field(char *field, struct > user_event *user, > goto skip_next; > } > > - len = str_has_prefix(field, "__data_loc unsigned "); > - if (len) > - goto skip_next; > - > - len = str_has_prefix(field, "__data_loc "); > - if (len) > - goto skip_next; > - > - len = str_has_prefix(field, "__rel_loc unsigned "); > - if (len) > - goto skip_next; > - > - len = str_has_prefix(field, "__rel_loc "); > - if (len) > - goto skip_next; > + if (!(len = str_has_prefix(field, "__data_loc unsigned ")) && > + !(len = str_has_prefix(field, "__data_loc ")) && > + !(len = str_has_prefix(field, "__rel_loc unsigned ")) && > + !(len = str_has_prefix(field, "__rel_loc "))) { > + goto parse; > + } This now triggers a checkpatch error: ERROR: do not use assignment in if condition #1184: FILE: kernel/trace/trace_events_user.c:1184: + if (!(len = str_has_prefix(field, "__data_loc unsigned ")) && I personally prefer to keep these files fully checkpatch clean. However, I did test these changes under the self-tests and it passed. Do they bug you that much? :) Thanks, -Beau > > - goto parse; > skip_next: > type = field; > field = strpbrk(field + len, " "); > -- > 2.43.0
[PATCH] eventfs: Fix events beyond NAME_MAX blocking tasks
Eventfs uses simple_lookup(), however, it will fail if the name of the entry is beyond NAME_MAX length. When this error is encountered, eventfs still tries to create dentries instead of skipping the dentry creation. When the dentry is attempted to be created in this state d_wait_lookup() will loop forever, waiting for the lookup to be removed. Fix eventfs to return the error in simple_lookup() back to the caller instead of continuing to try to create the dentry. Fixes: 63940449555e ("eventfs: Implement eventfs lookup, read, open functions") Link: https://lore.kernel.org/linux-trace-kernel/20231208183601.ga46-be...@linux.microsoft.com/ Signed-off-by: Beau Belgrave --- Please note the fixes tag is the first add of simple_lookup logic, there have been other changes beyond that may be a better fit to targetting. fs/tracefs/event_inode.c | 4 1 file changed, 4 insertions(+) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f8a594a50ae6..d2c06ba26db4 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -561,6 +561,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, if (strcmp(ei_child->name, name) != 0) continue; ret = simple_lookup(dir, dentry, flags); + if (IS_ERR(ret)) + goto out; create_dir_dentry(ei, ei_child, ei_dentry, true); created = true; break; @@ -583,6 +585,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, if (r <= 0) continue; ret = simple_lookup(dir, dentry, flags); + if (IS_ERR(ret)) + goto out; create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops, true); break; -- 2.34.1
trace_event names with more than NAME_MAX chars
While developing some unrelated features I happened to create a trace_event that was more than NAME_MAX (255) characters. When this happened the creation worked, but tracefs would hang any task that tried to list the directory of the trace_event or remove it. I followed the code down to the reason being eventfs would call simple_lookup(), and if it failed, it would still try to create the dentry. In this case DCACHE_PAR_LOOKUP would get set and never cleared. This caused d_wait_lookup() to loop forever, since that flag is used in d_in_lookup(). Both tracefs and eventfs use simple_lookup() and it fails for dentries that exceed NAME_MAX. Should we even allow trace_events to be created that exceed this limit? Or should tracefs/eventfs allow this but somehow represent these differently? I have a fix that appears to work for myself, but unsure if there are other locations (attached at the end of this mail). Thanks, -Beau diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f8a594a50ae6..d2c06ba26db4 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -561,6 +561,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, if (strcmp(ei_child->name, name) != 0) continue; ret = simple_lookup(dir, dentry, flags); + if (IS_ERR(ret)) + goto out; create_dir_dentry(ei, ei_child, ei_dentry, true); created = true; break; @@ -583,6 +585,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, if (r <= 0) continue; ret = simple_lookup(dir, dentry, flags); + if (IS_ERR(ret)) + goto out; create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops, true); break;
Re: [PATCH] tracing: Have trace_event_file have ref counters
On Tue, Oct 31, 2023 at 12:24:53PM -0400, Steven Rostedt wrote: > From: "Steven Rostedt (Google)" > > The following can crash the kernel: > > # cd /sys/kernel/tracing > # echo 'p:sched schedule' > kprobe_events > # exec 5>>events/kprobes/sched/enable > # > kprobe_events > # exec 5>&- > > The above commands: > > 1. Change directory to the tracefs directory > 2. Create a kprobe event (doesn't matter what one) > 3. Open bash file descriptor 5 on the enable file of the kprobe event > 4. Delete the kprobe event (removes the files too) > 5. Close the bash file descriptor 5 > > The above causes a crash! > > BUG: kernel NULL pointer dereference, address: 0028 > #PF: supervisor read access in kernel mode > #PF: error_code(0x) - not-present page > PGD 0 P4D 0 > Oops: [#1] PREEMPT SMP PTI > CPU: 6 PID: 877 Comm: bash Not tainted > 6.5.0-rc4-test-8-g2c6b6b1029d4-dirty #186 > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS > 1.16.2-debian-1.16.2-1 04/01/2014 > RIP: 0010:tracing_release_file_tr+0xc/0x50 > > What happens here is that the kprobe event creates a trace_event_file > "file" descriptor that represents the file in tracefs to the event. It > maintains state of the event (is it enabled for the given instance?). > Opening the "enable" file gets a reference to the event "file" descriptor > via the open file descriptor. When the kprobe event is deleted, the file is > also deleted from the tracefs system which also frees the event "file" > descriptor. > > But as the tracefs file is still opened by user space, it will not be > totally removed until the final dput() is called on it. But this is not > true with the event "file" descriptor that is already freed. If the user > does a write to or simply closes the file descriptor it will reference the > event "file" descriptor that was just freed, causing a use-after-free bug. > > To solve this, add a ref count to the event "file" descriptor as well as a > new flag called "FREED". The "file" will not be freed until the last > reference is released. But the FREE flag will be set when the event is > removed to prevent any more modifications to that event from happening, > even if there's still a reference to the event "file" descriptor. > > Link: > https://lore.kernel.org/linux-trace-kernel/2023103131.1e705...@gandalf.local.home/ > I ran this patch with your repro and via the user_events ftrace self test repro. Both worked correctly, thanks! Tested-by: Beau Belgrave Thanks, -Beau > Cc: sta...@vger.kernel.org > Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and > filter files") > Reported-by: Beau Belgrave > Signed-off-by: Steven Rostedt (Google) > --- > include/linux/trace_events.h | 4 > kernel/trace/trace.c | 15 +++ > kernel/trace/trace.h | 3 +++ > kernel/trace/trace_events.c| 31 ++ > kernel/trace/trace_events_filter.c | 3 +++ > 5 files changed, 52 insertions(+), 4 deletions(-) > > diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h > index 12207dc6722d..696f8dc4aa53 100644 > --- a/include/linux/trace_events.h > +++ b/include/linux/trace_events.h > @@ -492,6 +492,7 @@ enum { > EVENT_FILE_FL_TRIGGER_COND_BIT, > EVENT_FILE_FL_PID_FILTER_BIT, > EVENT_FILE_FL_WAS_ENABLED_BIT, > + EVENT_FILE_FL_FREED_BIT, > }; > > extern struct trace_event_file *trace_get_event_file(const char *instance, > @@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd > *cmd, ...); > * TRIGGER_COND - When set, one or more triggers has an associated filter > * PID_FILTER- When set, the event is filtered based on pid > * WAS_ENABLED - Set when enabled to know to clear trace on module removal > + * FREED - File descriptor is freed, all fields should be > considered invalid > */ > enum { > EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), > @@ -643,6 +645,7 @@ enum { > EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), > EVENT_FILE_FL_PID_FILTER= (1 << EVENT_FILE_FL_PID_FILTER_BIT), > EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), > + EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT), > }; > > struct trace_event_file { > @@ -671,6 +674,7 @@ struct trace_event_file { >* caching and such. Which is mostly OK ;-) &g
Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment
On Mon, Sep 25, 2023 at 09:53:16AM +0200, Clément Léger wrote: > > > On 22/09/2023 21:22, Beau Belgrave wrote: > > On Tue, Sep 19, 2023 at 02:59:12PM +0200, Clément Léger wrote: > >> > >> > >> On 14/09/2023 19:29, Steven Rostedt wrote: > >>> On Thu, 14 Sep 2023 13:17:00 -0400 > >>> Steven Rostedt wrote: > >>> > >>>> Now lets look at big endian layout: > >>>> > >>>> uaddr = 0xbeef0004 > >>>> enabler = 1; > >>>> > >>>> memory at 0xbeef: 00 00 00 00 00 00 00 02 > >>>> ^ > >>>> addr: 0xbeef0004 > >>>> > >>>> (enabler is set ) > >>>> > >>>> bitoffset = uaddr & (sizeof(unsigned long) - 1); bitoffset = 4 > >>>> bit_offset *= 8; bitoffset = 32 > >>>> uaddr &= ~(sizeof(unsigned long) - 1); uaddr = 0xbeef > >>>> > >>>> ptr = kaddr + (uaddr & ~PAGE_MASK); > >>>> > >>>> clear_bit(1 + 32, ptr); > >>>> > >>>> memory at 0xbeef: 00 00 00 00 00 00 00 02 > >>>> ^ > >>>> bit 33 of 0xbeef > >>>> > >>>> I don't think that's what you expected! > >>> > >>> I believe the above can be fixed with: > >>> > >>> bit_offset = uaddr & (sizeof(unsigned long) - 1); > >>> if (bit_offset) { > >>> #ifdef CONFIG_CPU_BIG_ENDIAN > >>> bit_offest = 0; > >>> #else > >>> bit_offset *= BITS_PER_BYTE; > >>> #endif > >>> uaddr &= ~(sizeof(unsigned long) - 1); > >>> } > >>> > >>> -- Steve > >> > >> > >> Actually, after looking more in depth at that, it seems like there are > >> actually 2 problems that can happen. > >> > >> First one is atomic access misalignment due to enable_size == 4 and > >> uaddr not being aligned on a (long) boundary on 64 bits architecture. > >> This can generate misaligned exceptions on various architectures. This > >> can be fixed in a more general way according to Masami snippet. > >> > >> Second one that I can see is on 64 bits, big endian architectures with > >> enable_size == 4. In that case, the bit provided by the userspace won't > >> be correctly set since this code kind of assume that the atomic are done > >> on 32bits value. Since the kernel assume long sized atomic operation, on > >> big endian 64 bits architecture, the updated bit will actually be in the > >> next 32 bits word. > >> > >> Can someone confirm my understanding ? > >> > > > > I have a ppc 64bit BE VM I've been validating this on. If we do the > > shifting within user_events (vs a generic set_bit_aligned approach) > > 64bit BE does not need additional bit manipulation. However, if we were > > to blindly pass the address and bit as is to set_bit_aligned() it > > assumes the bit number is for a long, not a 32 bit word. So for that > > approach we would need to offset the bit in the unaligned case. > > > > Here's a patch I have that I've validated on ppc64 BE, aarch64 LE, and > > x86_64 LE. I personally feel more comfortable with this approach than > > the generic set_bit_aligned() one. > > > > Thanks, > > -Beau > > > > diff --git a/kernel/trace/trace_events_user.c > > b/kernel/trace/trace_events_user.c > > index e3f2b8d72e01..ae854374d0b7 100644 > > --- a/kernel/trace/trace_events_user.c > > +++ b/kernel/trace/trace_events_user.c > > @@ -162,6 +162,23 @@ struct user_event_validator { > > int flags; > > }; > > > > +static inline void align_addr_bit(unsigned long *addr, int *bit) > > +{ > > + if (IS_ALIGNED(*addr, sizeof(long))) > > + return; > > + > > + *addr = ALIGN_DOWN(*addr, sizeof(long)); > > + > > + /* > > +* We only support 32 and 64 bit values. The only time we need > > +* to align is a 32 bit value on a 64 bit kernel, which on LE > > +* is always 32 bits, and on BE requires no change. > > +*/ > > +#ifdef __LITTLE_ENDIAN > > + *bit += 32; > > +#endif > > Hi Beau, except the specific alignment that
Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment
On Tue, Sep 19, 2023 at 02:59:12PM +0200, Clément Léger wrote: > > > On 14/09/2023 19:29, Steven Rostedt wrote: > > On Thu, 14 Sep 2023 13:17:00 -0400 > > Steven Rostedt wrote: > > > >> Now lets look at big endian layout: > >> > >> uaddr = 0xbeef0004 > >> enabler = 1; > >> > >> memory at 0xbeef: 00 00 00 00 00 00 00 02 > >> ^ > >> addr: 0xbeef0004 > >> > >>(enabler is set ) > >> > >>bitoffset = uaddr & (sizeof(unsigned long) - 1); bitoffset = 4 > >>bit_offset *= 8; bitoffset = 32 > >>uaddr &= ~(sizeof(unsigned long) - 1); uaddr = 0xbeef > >> > >>ptr = kaddr + (uaddr & ~PAGE_MASK); > >> > >>clear_bit(1 + 32, ptr); > >> > >> memory at 0xbeef: 00 00 00 00 00 00 00 02 > >> ^ > >>bit 33 of 0xbeef > >> > >> I don't think that's what you expected! > > > > I believe the above can be fixed with: > > > > bit_offset = uaddr & (sizeof(unsigned long) - 1); > > if (bit_offset) { > > #ifdef CONFIG_CPU_BIG_ENDIAN > > bit_offest = 0; > > #else > > bit_offset *= BITS_PER_BYTE; > > #endif > > uaddr &= ~(sizeof(unsigned long) - 1); > > } > > > > -- Steve > > > Actually, after looking more in depth at that, it seems like there are > actually 2 problems that can happen. > > First one is atomic access misalignment due to enable_size == 4 and > uaddr not being aligned on a (long) boundary on 64 bits architecture. > This can generate misaligned exceptions on various architectures. This > can be fixed in a more general way according to Masami snippet. > > Second one that I can see is on 64 bits, big endian architectures with > enable_size == 4. In that case, the bit provided by the userspace won't > be correctly set since this code kind of assume that the atomic are done > on 32bits value. Since the kernel assume long sized atomic operation, on > big endian 64 bits architecture, the updated bit will actually be in the > next 32 bits word. > > Can someone confirm my understanding ? > Actually, I take back some of what I said [1]. If a 32-bit on a 64-bit kernel comes in on BE, and is aligned, we do need to offset the bits as well (just verified on my ppc64 BE VM). You should be able to use that patch as a base though and add a flag to struct user_event_enabler when this case occurs. Then in the align_addr_bit() adjust the bits as well upon aligned cases. Thanks, -Beau 1. https://lore.kernel.org/linux-trace-kernel/20230922192231.ga1828-be...@linux.microsoft.com/ > Clément
Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment
On Tue, Sep 19, 2023 at 02:59:12PM +0200, Clément Léger wrote: > > > On 14/09/2023 19:29, Steven Rostedt wrote: > > On Thu, 14 Sep 2023 13:17:00 -0400 > > Steven Rostedt wrote: > > > >> Now lets look at big endian layout: > >> > >> uaddr = 0xbeef0004 > >> enabler = 1; > >> > >> memory at 0xbeef: 00 00 00 00 00 00 00 02 > >> ^ > >> addr: 0xbeef0004 > >> > >>(enabler is set ) > >> > >>bitoffset = uaddr & (sizeof(unsigned long) - 1); bitoffset = 4 > >>bit_offset *= 8; bitoffset = 32 > >>uaddr &= ~(sizeof(unsigned long) - 1); uaddr = 0xbeef > >> > >>ptr = kaddr + (uaddr & ~PAGE_MASK); > >> > >>clear_bit(1 + 32, ptr); > >> > >> memory at 0xbeef: 00 00 00 00 00 00 00 02 > >> ^ > >>bit 33 of 0xbeef > >> > >> I don't think that's what you expected! > > > > I believe the above can be fixed with: > > > > bit_offset = uaddr & (sizeof(unsigned long) - 1); > > if (bit_offset) { > > #ifdef CONFIG_CPU_BIG_ENDIAN > > bit_offest = 0; > > #else > > bit_offset *= BITS_PER_BYTE; > > #endif > > uaddr &= ~(sizeof(unsigned long) - 1); > > } > > > > -- Steve > > > Actually, after looking more in depth at that, it seems like there are > actually 2 problems that can happen. > > First one is atomic access misalignment due to enable_size == 4 and > uaddr not being aligned on a (long) boundary on 64 bits architecture. > This can generate misaligned exceptions on various architectures. This > can be fixed in a more general way according to Masami snippet. > > Second one that I can see is on 64 bits, big endian architectures with > enable_size == 4. In that case, the bit provided by the userspace won't > be correctly set since this code kind of assume that the atomic are done > on 32bits value. Since the kernel assume long sized atomic operation, on > big endian 64 bits architecture, the updated bit will actually be in the > next 32 bits word. > > Can someone confirm my understanding ? > I have a ppc 64bit BE VM I've been validating this on. If we do the shifting within user_events (vs a generic set_bit_aligned approach) 64bit BE does not need additional bit manipulation. However, if we were to blindly pass the address and bit as is to set_bit_aligned() it assumes the bit number is for a long, not a 32 bit word. So for that approach we would need to offset the bit in the unaligned case. Here's a patch I have that I've validated on ppc64 BE, aarch64 LE, and x86_64 LE. I personally feel more comfortable with this approach than the generic set_bit_aligned() one. Thanks, -Beau diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e3f2b8d72e01..ae854374d0b7 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -162,6 +162,23 @@ struct user_event_validator { int flags; }; +static inline void align_addr_bit(unsigned long *addr, int *bit) +{ + if (IS_ALIGNED(*addr, sizeof(long))) + return; + + *addr = ALIGN_DOWN(*addr, sizeof(long)); + + /* +* We only support 32 and 64 bit values. The only time we need +* to align is a 32 bit value on a 64 bit kernel, which on LE +* is always 32 bits, and on BE requires no change. +*/ +#ifdef __LITTLE_ENDIAN + *bit += 32; +#endif +} + typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted); @@ -481,6 +498,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, unsigned long *ptr; struct page *page; void *kaddr; + int bit = ENABLE_BIT(enabler); int ret; lockdep_assert_held(_mutex); @@ -496,6 +514,8 @@ static int user_event_enabler_write(struct user_event_mm *mm, test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler return -EBUSY; + align_addr_bit(, ); + ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, , NULL); @@ -514,9 +534,9 @@ static int user_event_enabler_write(struct user_event_mm *mm, /* Update bit atomically, user tracers must be atomic as well */ if (enabler->event && enabler->event->status) - set_bit(ENABLE_BIT(enabler), ptr); + set_bit(bit, ptr); else - clear_bit(ENABLE_BIT(enabler), ptr); + clear_bit(bit, ptr); kunmap_local(kaddr); unpin_user_pages_dirty_lock(, 1, true);
Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment
On Thu, Sep 14, 2023 at 03:11:02PM +0200, Clément Léger wrote: > enabler->uaddr can be aligned on 32 or 64 bits. If aligned on 32 bits, > this will result in a misaligned access on 64 bits architectures since > set_bit()/clear_bit() are expecting an unsigned long (aligned) pointer. > On architecture that do not support misaligned access, this will crash > the kernel. Align uaddr on unsigned long size to avoid such behavior. > This bug was found while running kselftests on RISC-V. > > Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event > enablement") > Signed-off-by: Clément Léger Thanks for fixing! I have a few comments on this. I unfortunately do not have RISC-V hardware to validate this on. > --- > kernel/trace/trace_events_user.c | 12 +--- > 1 file changed, 9 insertions(+), 3 deletions(-) > > diff --git a/kernel/trace/trace_events_user.c > b/kernel/trace/trace_events_user.c > index 6f046650e527..580c0fe4b23e 100644 > --- a/kernel/trace/trace_events_user.c > +++ b/kernel/trace/trace_events_user.c > @@ -479,7 +479,7 @@ static int user_event_enabler_write(struct user_event_mm > *mm, > bool fixup_fault, int *attempt) > { > unsigned long uaddr = enabler->addr; > - unsigned long *ptr; > + unsigned long *ptr, bit_offset; > struct page *page; > void *kaddr; > int ret; > @@ -511,13 +511,19 @@ static int user_event_enabler_write(struct > user_event_mm *mm, > } > > kaddr = kmap_local_page(page); > + > + bit_offset = uaddr & (sizeof(unsigned long) - 1); > + if (bit_offset) { > + bit_offset *= 8; I think for future readers of this code it would be more clear to use BITS_PER_BYTE instead of the hardcoded 8. Given we always align on a "natural" boundary, I believe the bit_offset will always be 32 bits. A comment here might help clarify why we do this as well in case folks don't see the change description. > + uaddr &= ~(sizeof(unsigned long) - 1); Shouldn't this uaddr change be done before calling pin_user_pages_remote() to ensure things cannot go bad? (I don't think they can, but it looks a little odd). Thanks, -Beau > + } > ptr = kaddr + (uaddr & ~PAGE_MASK); > > /* Update bit atomically, user tracers must be atomic as well */ > if (enabler->event && enabler->event->status) > - set_bit(ENABLE_BIT(enabler), ptr); > + set_bit(ENABLE_BIT(enabler) + bit_offset, ptr); > else > - clear_bit(ENABLE_BIT(enabler), ptr); > + clear_bit(ENABLE_BIT(enabler) + bit_offset, ptr); > > kunmap_local(kaddr); > unpin_user_pages_dirty_lock(, 1, true); > -- > 2.40.1
[PATCH v2 1/3] tracing/user_events: Allow events to persist for perfmon_capable users
There are several scenarios that have come up where having a user_event persist even if the process that registered it exits. The main one is having a daemon create events on bootup that shouldn't get deleted if the daemon has to exit or reload. Another is within OpenTelemetry exporters, they wish to potentially check if a user_event exists on the system to determine if exporting the data out should occur. The user_event in this case must exist even in the absence of the owning process running (such as the above daemon case). Expose the previously internal flag USER_EVENT_REG_PERSIST to user processes. Upon register or delete of events with this flag, ensure the user is perfmon_capable to prevent random user processes with access to tracefs from creating events that persist after exit. Signed-off-by: Beau Belgrave --- include/uapi/linux/user_events.h | 11 +- kernel/trace/trace_events_user.c | 36 +++- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index 2984aae4a2b4..f74f3aedd49c 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -17,6 +17,15 @@ /* Create dynamic location entry within a 32-bit value */ #define DYN_LOC(offset, size) ((size) << 16 | (offset)) +/* List of supported registration flags */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + /* * Describes an event registration and stores the results of the registration. * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum @@ -33,7 +42,7 @@ struct user_reg { /* Input: Enable size in bytes at address */ __u8enable_size; - /* Input: Flags for future use, set to 0 */ + /* Input: Flags to use, if any */ __u16 flags; /* Input: Address to update when enabled */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 6f046650e527..e3f2b8d72e01 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -49,18 +49,6 @@ #define EVENT_STATUS_PERF BIT(1) #define EVENT_STATUS_OTHER BIT(7) -/* - * User register flags are not allowed yet, keep them here until we are - * ready to expose them out to the user ABI. - */ -enum user_reg_flag { - /* Event will not delete upon last reference closing */ - USER_EVENT_REG_PERSIST = 1U << 0, - - /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, -}; - /* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. @@ -191,6 +179,17 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static bool user_event_capable(u16 reg_flags) +{ + /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */ + if (reg_flags & USER_EVENT_REG_PERSIST) { + if (!perfmon_capable()) + return false; + } + + return true; +} + static struct user_event *user_event_get(struct user_event *user) { refcount_inc(>refcnt); @@ -1773,6 +1772,9 @@ static int user_event_free(struct dyn_event *ev) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } @@ -1888,10 +1890,13 @@ static int user_event_parse(struct user_event_group *group, char *name, int argc = 0; char **argv; - /* User register flags are not ready yet */ - if (reg_flags != 0 || flags != NULL) + /* Currently don't support any text based flags */ + if (flags != NULL) return -EINVAL; + if (!user_event_capable(reg_flags)) + return -EPERM; + /* Prevent dyn_event from racing */ mutex_lock(_mutex); user = find_user_event(group, name, ); @@ -2024,6 +2029,9 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } -- 2.34.1
[PATCH v2 0/3] tracing/user_events: Allow events to persist for perfmon_capable users
There are several scenarios that have come up where having a user_event persist even if the process that registered it exits. The main one is having a daemon create events on bootup that shouldn't get deleted if the daemon has to exit or reload. Another is within OpenTelemetry exporters, they wish to potentially check if a user_event exists on the system to determine if exporting the data out should occur. The user_event in this case must exist even in the absence of the owning process running (such as the above daemon case). Since persistent events aren't automatically cleaned up, we want to ensure only trusted users are allowed to do this. It seems reasonable to use CAP_PERFMON as that boundary, since those users can already do many things via perf_event_open without requiring full CAP_SYS_ADMIN. This patchset brings back the ability to use /sys/kernel/tracing/dynamic_events to create user_events, as persist is now back to being supported. Both the register and delete of events that persist require CAP_PERFMON, which prevents a non-perfmon user from making an event go away that a perfmon user decided should persist. Change History: V2: Added common user_event_capable() function to check access given the register flags. Ensure access check is done for dynamic_events when implicitly removing events, such as "echo 'u:test' > /sys/kernel/tracing/dynamic_events". Beau Belgrave (3): tracing/user_events: Allow events to persist for perfmon_capable users selftests/user_events: Test persist flag cases tracing/user_events: Document persist event flags Documentation/trace/user_events.rst | 21 ++- include/uapi/linux/user_events.h | 11 +++- kernel/trace/trace_events_user.c | 36 +++- .../testing/selftests/user_events/abi_test.c | 55 ++- .../testing/selftests/user_events/dyn_test.c | 54 +- 5 files changed, 157 insertions(+), 20 deletions(-) base-commit: fc1653abba0d554aad80224e51bcad42b09895ed -- 2.34.1
[PATCH v2 3/3] tracing/user_events: Document persist event flags
Users need to know how to make events persist now that we allow for that. We also now allow the dynamic_events file to create events by utilizing the persist flag during event register. Add back in to documentation how /sys/kernel/tracing/dynamic_events can be used to create persistent user_events. Add a section under registering for the currently supported flags (USER_EVENT_REG_PERSIST) and the required permissions. Add a note under deleting that deleting a persistent event also requires sufficient permission. Signed-off-by: Beau Belgrave --- Documentation/trace/user_events.rst | 21 ++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index e7b07313550a..576d2c35f22e 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -14,6 +14,11 @@ Programs can view status of the events via /sys/kernel/tracing/user_events_status and can both register and write data out via /sys/kernel/tracing/user_events_data. +Programs can also use /sys/kernel/tracing/dynamic_events to register and +delete user based events via the u: prefix. The format of the command to +dynamic_events is the same as the ioctl with the u: prefix applied. This +requires CAP_PERFMON due to the event persisting, otherwise -EPERM is returned. + Typically programs will register a set of events that they wish to expose to tools that can read trace_events (such as ftrace and perf). The registration process tells the kernel which address and bit to reflect if any tool has @@ -45,7 +50,7 @@ This command takes a packed struct user_reg as an argument:: /* Input: Enable size in bytes at address */ __u8 enable_size; -/* Input: Flags for future use, set to 0 */ +/* Input: Flags to use, if any */ __u16 flags; /* Input: Address to update when enabled */ @@ -69,7 +74,7 @@ The struct user_reg requires all the above inputs to be set appropriately. This must be 4 (32-bit) or 8 (64-bit). 64-bit values are only allowed to be used on 64-bit kernels, however, 32-bit can be used on all kernels. -+ flags: The flags to use, if any. For the initial version this must be 0. ++ flags: The flags to use, if any. Callers should first attempt to use flags and retry without flags to ensure support for lower versions of the kernel. If a flag is not supported -EINVAL is returned. @@ -80,6 +85,13 @@ The struct user_reg requires all the above inputs to be set appropriately. + name_args: The name and arguments to describe the event, see command format for details. +The following flags are currently supported. + ++ USER_EVENT_REG_PERSIST: The event will not delete upon the last reference + closing. Callers may use this if an event should exist even after the + process closes or unregisters the event. Requires CAP_PERFMON otherwise + -EPERM is returned. + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -141,7 +153,10 @@ event (in both user and kernel space). User programs should use a separate file to request deletes than the one used for registration due to this. **NOTE:** By default events will auto-delete when there are no references left -to the event. Flags in the future may change this logic. +to the event. If programs do not want auto-delete, they must use the +USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used +the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. Unregistering - -- 2.34.1
[PATCH v2 2/3] selftests/user_events: Test persist flag cases
Now that we have exposed USER_EVENT_REG_PERSIST events can persist both via the ABI and in the /sys/kernel/tracing/dynamic_events file. Ensure both the ABI and DYN cases work by calling both during the parse tests. Add new flags test that ensures only USER_EVENT_REG_PERSIST is honored and any other flag is invalid. Signed-off-by: Beau Belgrave --- .../testing/selftests/user_events/abi_test.c | 55 ++- .../testing/selftests/user_events/dyn_test.c | 54 +- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index 5125c42efe65..b95fc15496ba 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -23,6 +23,18 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; +static bool event_exists(void) +{ + int fd = open(enable_file, O_RDWR); + + if (fd < 0) + return false; + + close(fd); + + return true; +} + static int change_event(bool enable) { int fd = open(enable_file, O_RDWR); @@ -46,7 +58,22 @@ static int change_event(bool enable) return ret; } -static int reg_enable(long *enable, int size, int bit) +static int event_delete(void) +{ + int fd = open(data_file, O_RDWR); + int ret; + + if (fd < 0) + return -1; + + ret = ioctl(fd, DIAG_IOCSDEL, "__abi_event"); + + close(fd); + + return ret; +} + +static int reg_enable_flags(long *enable, int size, int bit, int flags) { struct user_reg reg = {0}; int fd = open(data_file, O_RDWR); @@ -57,6 +84,7 @@ static int reg_enable(long *enable, int size, int bit) reg.size = sizeof(reg); reg.name_args = (__u64)"__abi_event"; + reg.flags = flags; reg.enable_bit = bit; reg.enable_addr = (__u64)enable; reg.enable_size = size; @@ -68,6 +96,11 @@ static int reg_enable(long *enable, int size, int bit) return ret; } +static int reg_enable(long *enable, int size, int bit) +{ + return reg_enable_flags(enable, size, bit, 0); +} + static int reg_disable(long *enable, int bit) { struct user_unreg reg = {0}; @@ -121,6 +154,26 @@ TEST_F(user, enablement) { ASSERT_EQ(0, change_event(false)); } +TEST_F(user, flags) { + /* USER_EVENT_REG_PERSIST is allowed */ + ASSERT_EQ(0, reg_enable_flags(>check, sizeof(int), 0, + USER_EVENT_REG_PERSIST)); + ASSERT_EQ(0, reg_disable(>check, 0)); + + /* Ensure it exists after close and disable */ + ASSERT_TRUE(event_exists()); + + /* Ensure we can delete it */ + ASSERT_EQ(0, event_delete()); + + /* USER_EVENT_REG_MAX or above is not allowed */ + ASSERT_EQ(-1, reg_enable_flags(>check, sizeof(int), 0, + USER_EVENT_REG_MAX)); + + /* Ensure it does not exist after invalid flags */ + ASSERT_FALSE(event_exists()); +} + TEST_F(user, bit_sizes) { /* Allow 0-31 bits for 32-bit */ ASSERT_EQ(0, reg_enable(>check, sizeof(int), 0)); diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c index 91aad42b..f2a41bcb5ad8 100644 --- a/tools/testing/selftests/user_events/dyn_test.c +++ b/tools/testing/selftests/user_events/dyn_test.c @@ -16,9 +16,25 @@ #include "../kselftest_harness.h" +const char *dyn_file = "/sys/kernel/tracing/dynamic_events"; const char *abi_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__test_event/enable"; +static int event_delete(void) +{ + int fd = open(abi_file, O_RDWR); + int ret; + + if (fd < 0) + return -1; + + ret = ioctl(fd, DIAG_IOCSDEL, "__test_event"); + + close(fd); + + return ret; +} + static bool wait_for_delete(void) { int i; @@ -63,7 +79,31 @@ static int unreg_event(int fd, int *check, int bit) return ioctl(fd, DIAG_IOCSUNREG, ); } -static int parse(int *check, const char *value) +static int parse_dyn(const char *value) +{ + int fd = open(dyn_file, O_RDWR | O_APPEND); + int len = strlen(value); + int ret; + + if (fd == -1) + return -1; + + ret = write(fd, value, len); + + if (ret == len) + ret = 0; + else + ret = -1; + + close(fd); + + if (ret == 0) + event_delete(); + + return ret; +} + +static int parse_abi(int *check, const char *value) { int fd = open(abi_file, O_RDWR); int ret; @@ -89,6 +129,18 @@ static int parse(int *che