Re: [PATCH v2 1/2] tracing/user_events: Fix non-spaced field matching

2024-05-02 Thread Beau Belgrave
On Thu, May 02, 2024 at 05:16:34PM -0400, Steven Rostedt wrote:
> On Tue, 23 Apr 2024 16:23:37 +
> Beau Belgrave  wrote:
> 
> > When the ABI was updated to prevent same name w/different args, it
> > missed an important corner case when fields don't end with a space.
> > Typically, space is used for fields to help separate them, like
> > "u8 field1; u8 field2". If no spaces are used, like
> > "u8 field1;u8 field2", then the parsing works for the first time.
> > However, the match check fails on a subsequent register, leading to
> > confusion.
> > 
> > This is because the match check uses argv_split() and assumes that all
> > fields will be split upon the space. When spaces are used, we get back
> > { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }.
> > This causes a mismatch, and the user program gets back -EADDRINUSE.
> > 
> > Add a method to detect this case before calling argv_split(). If found
> > force a space after the field separator character ';'. This ensures all
> > cases work properly for matching.
> > 
> > With this fix, the following are all treated as matching:
> > u8 field1;u8 field2
> > u8 field1; u8 field2
> > u8 field1;\tu8 field2
> > u8 field1;\nu8 field2
> 
> I'm curious, what happens if you have: "u8 field1; u8 field2;" ?
> 

You'll get an extra whitespace during the copy, assuming it was really:
"u8 field1;u8 field2"

If it had spaces, this code wouldn't run.

> Do you care? As you will then create "u8 field1; u8 field2; "
> 
> but I'm guessing the extra whitespace at the end doesn't affect anything.
> 

Right, you get an extra byte allocated, but the argv_split() with ignore
it. The compare will work correctly (I've verified this just now to
double check).

IE these all match:
"Test u8 a; u8 b; "
"Test u8 a; u8 b;"
"Test u8 a; u8 b"

> 
> > 
> > Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different 
> > args event")
> > Signed-off-by: Beau Belgrave 
> > ---
> >  kernel/trace/trace_events_user.c | 76 +++-
> >  1 file changed, 75 insertions(+), 1 deletion(-)
> > 
> > diff --git a/kernel/trace/trace_events_user.c 
> > b/kernel/trace/trace_events_user.c
> > index 70d428c394b6..82b191f33a28 100644
> > --- a/kernel/trace/trace_events_user.c
> > +++ b/kernel/trace/trace_events_user.c
> > @@ -1989,6 +1989,80 @@ static int user_event_set_tp_name(struct user_event 
> > *user)
> > return 0;
> >  }
> >  
> > +/*
> > + * Counts how many ';' without a trailing space are in the args.
> > + */
> > +static int count_semis_no_space(char *args)
> > +{
> > +   int count = 0;
> > +
> > +   while ((args = strchr(args, ';'))) {
> > +   args++;
> > +
> > +   if (!isspace(*args))
> > +   count++;
> 
> This will count that "..;" 
> 
> This is most likely not an issue, but since I didn't see this case
> anywhere, I figured I bring it up just to confirm that it's not an issue.
> 

It's not an issue on the matching/logic. However, you do get an extra
byte alloc (which doesn't bother me in this edge case).

Thanks,
-Beau

> -- Steve
> 
> 
> > +   }
> > +
> > +   return count;
> > +}
> > +
> > +/*
> > + * Copies the arguments while ensuring all ';' have a trailing space.
> > + */
> > +static char *insert_space_after_semis(char *args, int count)
> > +{
> > +   char *fixed, *pos;
> > +   int len;
> > +
> > +   len = strlen(args) + count;
> > +   fixed = kmalloc(len + 1, GFP_KERNEL);
> > +
> > +   if (!fixed)
> > +   return NULL;
> > +
> > +   pos = fixed;
> > +
> > +   /* Insert a space after ';' if there is no trailing space. */
> > +   while (*args) {
> > +   *pos = *args++;
> > +
> > +   if (*pos++ == ';' && !isspace(*args))
> > +   *pos++ = ' ';
> > +   }
> > +
> > +   *pos = '\0';
> > +
> > +   return fixed;
> > +}
> > +



[PATCH v2 1/2] tracing/user_events: Fix non-spaced field matching

2024-04-23 Thread Beau Belgrave
When the ABI was updated to prevent same name w/different args, it
missed an important corner case when fields don't end with a space.
Typically, space is used for fields to help separate them, like
"u8 field1; u8 field2". If no spaces are used, like
"u8 field1;u8 field2", then the parsing works for the first time.
However, the match check fails on a subsequent register, leading to
confusion.

This is because the match check uses argv_split() and assumes that all
fields will be split upon the space. When spaces are used, we get back
{ "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }.
This causes a mismatch, and the user program gets back -EADDRINUSE.

Add a method to detect this case before calling argv_split(). If found
force a space after the field separator character ';'. This ensures all
cases work properly for matching.

With this fix, the following are all treated as matching:
u8 field1;u8 field2
u8 field1; u8 field2
u8 field1;\tu8 field2
u8 field1;\nu8 field2

Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different args 
event")
Signed-off-by: Beau Belgrave 
---
 kernel/trace/trace_events_user.c | 76 +++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 70d428c394b6..82b191f33a28 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1989,6 +1989,80 @@ static int user_event_set_tp_name(struct user_event 
*user)
return 0;
 }
 
+/*
+ * Counts how many ';' without a trailing space are in the args.
+ */
+static int count_semis_no_space(char *args)
+{
+   int count = 0;
+
+   while ((args = strchr(args, ';'))) {
+   args++;
+
+   if (!isspace(*args))
+   count++;
+   }
+
+   return count;
+}
+
+/*
+ * Copies the arguments while ensuring all ';' have a trailing space.
+ */
+static char *insert_space_after_semis(char *args, int count)
+{
+   char *fixed, *pos;
+   int len;
+
+   len = strlen(args) + count;
+   fixed = kmalloc(len + 1, GFP_KERNEL);
+
+   if (!fixed)
+   return NULL;
+
+   pos = fixed;
+
+   /* Insert a space after ';' if there is no trailing space. */
+   while (*args) {
+   *pos = *args++;
+
+   if (*pos++ == ';' && !isspace(*args))
+   *pos++ = ' ';
+   }
+
+   *pos = '\0';
+
+   return fixed;
+}
+
+static char **user_event_argv_split(char *args, int *argc)
+{
+   char **split;
+   char *fixed;
+   int count;
+
+   /* Count how many ';' without a trailing space */
+   count = count_semis_no_space(args);
+
+   /* No fixup is required */
+   if (!count)
+   return argv_split(GFP_KERNEL, args, argc);
+
+   /* We must fixup 'field;field' to 'field; field' */
+   fixed = insert_space_after_semis(args, count);
+
+   if (!fixed)
+   return NULL;
+
+   /* We do a normal split afterwards */
+   split = argv_split(GFP_KERNEL, fixed, argc);
+
+   /* We can free since argv_split makes a copy */
+   kfree(fixed);
+
+   return split;
+}
+
 /*
  * Parses the event name, arguments and flags then registers if successful.
  * The name buffer lifetime is owned by this method for success cases only.
@@ -2012,7 +2086,7 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
return -EPERM;
 
if (args) {
-   argv = argv_split(GFP_KERNEL, args, );
+   argv = user_event_argv_split(args, );
 
if (!argv)
return -ENOMEM;
-- 
2.34.1




[PATCH v2 2/2] selftests/user_events: Add non-spacing separator check

2024-04-23 Thread Beau Belgrave
The ABI documentation indicates that field separators do not need a
space between them, only a ';'. When no spacing is used, the register
must work. Any subsequent register, with or without spaces, must match
and not return -EADDRINUSE.

Add a non-spacing separator case to our self-test register case to ensure
it works going forward.

Signed-off-by: Beau Belgrave 
---
 tools/testing/selftests/user_events/ftrace_test.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/tools/testing/selftests/user_events/ftrace_test.c 
b/tools/testing/selftests/user_events/ftrace_test.c
index dcd7509fe2e0..0bb46793dcd4 100644
--- a/tools/testing/selftests/user_events/ftrace_test.c
+++ b/tools/testing/selftests/user_events/ftrace_test.c
@@ -261,6 +261,12 @@ TEST_F(user, register_events) {
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ));
ASSERT_EQ(0, reg.write_index);
 
+   /* Register without separator spacing should still match */
+   reg.enable_bit = 29;
+   reg.name_args = (__u64)"__test_event u32 field1;u32 field2";
+   ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ));
+   ASSERT_EQ(0, reg.write_index);
+
/* Multiple registers to same name but different args should fail */
reg.enable_bit = 29;
reg.name_args = (__u64)"__test_event u32 field1;";
@@ -288,6 +294,8 @@ TEST_F(user, register_events) {
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, ));
unreg.disable_bit = 30;
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, ));
+   unreg.disable_bit = 29;
+   ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, ));
 
/* Delete should have been auto-done after close and unregister */
close(self->data_fd);
-- 
2.34.1




[PATCH v2 0/2] tracing/user_events: Fix non-spaced field matching

2024-04-23 Thread Beau Belgrave
When the ABI was updated to prevent same name w/different args, it
missed an important corner case when fields don't end with a space.
Typically, space is used for fields to help separate them, like
"u8 field1; u8 field2". If no spaces are used, like
"u8 field1;u8 field2", then the parsing works for the first time.
However, the match check fails on a subsequent register, leading to
confusion.

This is because the match check uses argv_split() and assumes that all
fields will be split upon the space. When spaces are used, we get back
{ "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }.
This causes a mismatch, and the user program gets back -EADDRINUSE.

Add a method to detect this case before calling argv_split(). If found
force a space after the field separator character ';'. This ensures all
cases work properly for matching.

I could not find an existing function to accomplish this, so I had to
hand code a copy with this logic. If there is a better way to achieve
this, I'm all ears.

This series also adds a selftest to ensure this doesn't break again.

With this fix, the following are all treated as matching:
u8 field1;u8 field2
u8 field1; u8 field2
u8 field1;\tu8 field2
u8 field1;\nu8 field2

V2 changes:
  Renamed fix_semis_no_space() to insert_space_after_semis().
  Have user_event_argv_split() return fast in no-split case.
  Pulled in Masami's shorter loop in insert_space_after_semis().

Beau Belgrave (2):
  tracing/user_events: Fix non-spaced field matching
  selftests/user_events: Add non-spacing separator check

 kernel/trace/trace_events_user.c  | 76 ++-
 .../selftests/user_events/ftrace_test.c   |  8 ++
 2 files changed, 83 insertions(+), 1 deletion(-)


base-commit: 0bbac3facb5d6cc0171c45c9873a2dc96bea9680
-- 
2.34.1




Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching

2024-04-22 Thread Beau Belgrave
On Sat, Apr 20, 2024 at 09:50:52PM +0900, Masami Hiramatsu wrote:
> On Fri, 19 Apr 2024 14:13:34 -0700
> Beau Belgrave  wrote:
> 
> > On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote:
> > > On Tue, 16 Apr 2024 22:41:01 +
> > > Beau Belgrave  wrote:

*SNIP*

> > > nit: This loop can be simpler, because we are sure fixed has enough 
> > > length;
> > > 
> > > /* insert a space after ';' if there is no space. */
> > > while(*args) {
> > >   *pos = *args++;
> > >   if (*pos++ == ';' && !isspace(*args))
> > >   *pos++ = ' ';
> > > }
> > > 
> > 
> > I was worried that if count_semis_no_space() ever had different logic
> > (maybe after this commit) that it could cause an overflow if the count
> > was wrong, etc.
> > 
> > I don't have an issue making it shorter, but I was trying to be more on
> > the safe side, since this isn't a fast path (event register).
> 
> OK, anyway current code looks correct. But note that I don't think
> "pos++; len--;" is safer, since it is not atomic. This pattern
> easily loose "len--;" in my experience. So please carefully use it ;)
> 

I'll stick with your loop. Perhaps others will chime in on the v2 and
state a stronger opinion.

You scared me with the atomic comment, I went back and looked at all the
paths for this. In the user_events IOCTL the buffer is copied from user
to kernel, so it cannot change (and no other threads access it). I also
checked trace_parse_run_command() which is the same. So at least in this
context the non-atomic part is OK.

> > 
> > > > +
> > > > +   /*
> > > > +* len is the length of the copy excluding the null.
> > > > +* This ensures we always have room for a null.
> > > > +*/
> > > > +   *pos = '\0';
> > > > +
> > > > +   return fixed;
> > > > +}
> > > > +
> > > > +static char **user_event_argv_split(char *args, int *argc)
> > > > +{
> > > > +   /* Count how many ';' without a trailing space */
> > > > +   int count = count_semis_no_space(args);
> > > > +
> > > > +   if (count) {
> > > 
> > > nit: it is better to exit fast, so 
> > > 
> > >   if (!count)
> > >   return argv_split(GFP_KERNEL, args, argc);
> > > 
> > >   ...
> > 
> > Sure, will fix in a v2.
> > 
> > > 
> > > Thank you,
> > > 
> > > OT: BTW, can this also simplify synthetic events?
> > > 
> > 
> > I'm not sure, I'll check when I have some time. I want to get this fix
> > in sooner rather than later.
> 
> Ah, nevermind. Synthetic event parses the field by strsep(';') first
> and argv_split(). So it does not have this issue.
> 

Ok, seems unrelated. Thanks for checking.

Thanks,
-Beau

> Thank you,
> 
> > 
> > Thanks,
> > -Beau
> > 

*SNIP* 

> > > Masami Hiramatsu (Google) 
> 
> 
> -- 
> Masami Hiramatsu (Google) 



Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching

2024-04-19 Thread Beau Belgrave
On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote:
> On Tue, 16 Apr 2024 22:41:01 +
> Beau Belgrave  wrote:
> 
> > When the ABI was updated to prevent same name w/different args, it
> > missed an important corner case when fields don't end with a space.
> > Typically, space is used for fields to help separate them, like
> > "u8 field1; u8 field2". If no spaces are used, like
> > "u8 field1;u8 field2", then the parsing works for the first time.
> > However, the match check fails on a subsequent register, leading to
> > confusion.
> > 
> > This is because the match check uses argv_split() and assumes that all
> > fields will be split upon the space. When spaces are used, we get back
> > { "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }.
> > This causes a mismatch, and the user program gets back -EADDRINUSE.
> > 
> > Add a method to detect this case before calling argv_split(). If found
> > force a space after the field separator character ';'. This ensures all
> > cases work properly for matching.
> > 
> > With this fix, the following are all treated as matching:
> > u8 field1;u8 field2
> > u8 field1; u8 field2
> > u8 field1;\tu8 field2
> > u8 field1;\nu8 field2
> 
> Sounds good to me. I just have some nits.
> 
> > 
> > Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different 
> > args event")
> > Signed-off-by: Beau Belgrave 
> > ---
> >  kernel/trace/trace_events_user.c | 88 +++-
> >  1 file changed, 87 insertions(+), 1 deletion(-)
> > 
> > diff --git a/kernel/trace/trace_events_user.c 
> > b/kernel/trace/trace_events_user.c
> > index 70d428c394b6..9184d3962b2a 100644
> > --- a/kernel/trace/trace_events_user.c
> > +++ b/kernel/trace/trace_events_user.c
> > @@ -1989,6 +1989,92 @@ static int user_event_set_tp_name(struct user_event 
> > *user)
> > return 0;
> >  }
> >  
> > +/*
> > + * Counts how many ';' without a trailing space are in the args.
> > + */
> > +static int count_semis_no_space(char *args)
> > +{
> > +   int count = 0;
> > +
> > +   while ((args = strchr(args, ';'))) {
> > +   args++;
> > +
> > +   if (!isspace(*args))
> > +   count++;
> > +   }
> > +
> > +   return count;
> > +}
> > +
> > +/*
> > + * Copies the arguments while ensuring all ';' have a trailing space.
> > + */
> > +static char *fix_semis_no_space(char *args, int count)
> 
> nit: This name does not represent what it does. 'insert_space_after_semis()'
> is more self-described.
> 

Sure, will fix in a v2.

> > +{
> > +   char *fixed, *pos;
> > +   char c, last;
> > +   int len;
> > +
> > +   len = strlen(args) + count;
> > +   fixed = kmalloc(len + 1, GFP_KERNEL);
> > +
> > +   if (!fixed)
> > +   return NULL;
> > +
> > +   pos = fixed;
> > +   last = '\0';
> > +
> > +   while (len > 0) {
> > +   c = *args++;
> > +
> > +   if (last == ';' && !isspace(c)) {
> > +   *pos++ = ' ';
> > +   len--;
> > +   }
> > +
> > +   if (len > 0) {
> > +   *pos++ = c;
> > +   len--;
> > +   }
> > +
> > +   last = c;
> > +   }
> 
> nit: This loop can be simpler, because we are sure fixed has enough length;
> 
> /* insert a space after ';' if there is no space. */
> while(*args) {
>   *pos = *args++;
>   if (*pos++ == ';' && !isspace(*args))
>   *pos++ = ' ';
> }
> 

I was worried that if count_semis_no_space() ever had different logic
(maybe after this commit) that it could cause an overflow if the count
was wrong, etc.

I don't have an issue making it shorter, but I was trying to be more on
the safe side, since this isn't a fast path (event register).

> > +
> > +   /*
> > +* len is the length of the copy excluding the null.
> > +* This ensures we always have room for a null.
> > +*/
> > +   *pos = '\0';
> > +
> > +   return fixed;
> > +}
> > +
> > +static char **user_event_argv_split(char *args, int *argc)
> > +{
> > +   /* Count how many ';' without a trailing space */
> > +   int count = count_semis_no_space(args);
> > +
> > +   if (count) {
> 
> nit: it is better to exit fast, so 
> 
&

[PATCH 2/2] selftests/user_events: Add non-spacing separator check

2024-04-16 Thread Beau Belgrave
The ABI documentation indicates that field separators do not need a
space between them, only a ';'. When no spacing is used, the register
must work. Any subsequent register, with or without spaces, must match
and not return -EADDRINUSE.

Add a non-spacing separator case to our self-test register case to ensure
it works going forward.

Signed-off-by: Beau Belgrave 
---
 tools/testing/selftests/user_events/ftrace_test.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/tools/testing/selftests/user_events/ftrace_test.c 
b/tools/testing/selftests/user_events/ftrace_test.c
index dcd7509fe2e0..0bb46793dcd4 100644
--- a/tools/testing/selftests/user_events/ftrace_test.c
+++ b/tools/testing/selftests/user_events/ftrace_test.c
@@ -261,6 +261,12 @@ TEST_F(user, register_events) {
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ));
ASSERT_EQ(0, reg.write_index);
 
+   /* Register without separator spacing should still match */
+   reg.enable_bit = 29;
+   reg.name_args = (__u64)"__test_event u32 field1;u32 field2";
+   ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ));
+   ASSERT_EQ(0, reg.write_index);
+
/* Multiple registers to same name but different args should fail */
reg.enable_bit = 29;
reg.name_args = (__u64)"__test_event u32 field1;";
@@ -288,6 +294,8 @@ TEST_F(user, register_events) {
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, ));
unreg.disable_bit = 30;
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, ));
+   unreg.disable_bit = 29;
+   ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, ));
 
/* Delete should have been auto-done after close and unregister */
close(self->data_fd);
-- 
2.34.1




[PATCH 1/2] tracing/user_events: Fix non-spaced field matching

2024-04-16 Thread Beau Belgrave
When the ABI was updated to prevent same name w/different args, it
missed an important corner case when fields don't end with a space.
Typically, space is used for fields to help separate them, like
"u8 field1; u8 field2". If no spaces are used, like
"u8 field1;u8 field2", then the parsing works for the first time.
However, the match check fails on a subsequent register, leading to
confusion.

This is because the match check uses argv_split() and assumes that all
fields will be split upon the space. When spaces are used, we get back
{ "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }.
This causes a mismatch, and the user program gets back -EADDRINUSE.

Add a method to detect this case before calling argv_split(). If found
force a space after the field separator character ';'. This ensures all
cases work properly for matching.

With this fix, the following are all treated as matching:
u8 field1;u8 field2
u8 field1; u8 field2
u8 field1;\tu8 field2
u8 field1;\nu8 field2

Fixes: ba470eebc2f6 ("tracing/user_events: Prevent same name but different args 
event")
Signed-off-by: Beau Belgrave 
---
 kernel/trace/trace_events_user.c | 88 +++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 70d428c394b6..9184d3962b2a 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1989,6 +1989,92 @@ static int user_event_set_tp_name(struct user_event 
*user)
return 0;
 }
 
+/*
+ * Counts how many ';' without a trailing space are in the args.
+ */
+static int count_semis_no_space(char *args)
+{
+   int count = 0;
+
+   while ((args = strchr(args, ';'))) {
+   args++;
+
+   if (!isspace(*args))
+   count++;
+   }
+
+   return count;
+}
+
+/*
+ * Copies the arguments while ensuring all ';' have a trailing space.
+ */
+static char *fix_semis_no_space(char *args, int count)
+{
+   char *fixed, *pos;
+   char c, last;
+   int len;
+
+   len = strlen(args) + count;
+   fixed = kmalloc(len + 1, GFP_KERNEL);
+
+   if (!fixed)
+   return NULL;
+
+   pos = fixed;
+   last = '\0';
+
+   while (len > 0) {
+   c = *args++;
+
+   if (last == ';' && !isspace(c)) {
+   *pos++ = ' ';
+   len--;
+   }
+
+   if (len > 0) {
+   *pos++ = c;
+   len--;
+   }
+
+   last = c;
+   }
+
+   /*
+* len is the length of the copy excluding the null.
+* This ensures we always have room for a null.
+*/
+   *pos = '\0';
+
+   return fixed;
+}
+
+static char **user_event_argv_split(char *args, int *argc)
+{
+   /* Count how many ';' without a trailing space */
+   int count = count_semis_no_space(args);
+
+   if (count) {
+   /* We must fixup 'field;field' to 'field; field' */
+   char *fixed = fix_semis_no_space(args, count);
+   char **split;
+
+   if (!fixed)
+   return NULL;
+
+   /* We do a normal split afterwards */
+   split = argv_split(GFP_KERNEL, fixed, argc);
+
+   /* We can free since argv_split makes a copy */
+   kfree(fixed);
+
+   return split;
+   }
+
+   /* No fixup is required */
+   return argv_split(GFP_KERNEL, args, argc);
+}
+
 /*
  * Parses the event name, arguments and flags then registers if successful.
  * The name buffer lifetime is owned by this method for success cases only.
@@ -2012,7 +2098,7 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
return -EPERM;
 
if (args) {
-   argv = argv_split(GFP_KERNEL, args, );
+   argv = user_event_argv_split(args, );
 
if (!argv)
return -ENOMEM;
-- 
2.34.1




[PATCH 0/2] tracing/user_events: Fix non-spaced field matching

2024-04-16 Thread Beau Belgrave
When the ABI was updated to prevent same name w/different args, it
missed an important corner case when fields don't end with a space.
Typically, space is used for fields to help separate them, like
"u8 field1; u8 field2". If no spaces are used, like
"u8 field1;u8 field2", then the parsing works for the first time.
However, the match check fails on a subsequent register, leading to
confusion.

This is because the match check uses argv_split() and assumes that all
fields will be split upon the space. When spaces are used, we get back
{ "u8", "field1;" }, without spaces we get back { "u8", "field1;u8" }.
This causes a mismatch, and the user program gets back -EADDRINUSE.

Add a method to detect this case before calling argv_split(). If found
force a space after the field separator character ';'. This ensures all
cases work properly for matching.

I could not find an existing function to accomplish this, so I had to
hand code a copy with this logic. If there is a better way to achieve
this, I'm all ears.

This series also adds a selftest to ensure this doesn't break again.

With this fix, the following are all treated as matching:
u8 field1;u8 field2
u8 field1; u8 field2
u8 field1;\tu8 field2
u8 field1;\nu8 field2

Beau Belgrave (2):
  tracing/user_events: Fix non-spaced field matching
  selftests/user_events: Add non-spacing separator check

 kernel/trace/trace_events_user.c  | 88 ++-
 .../selftests/user_events/ftrace_test.c   |  8 ++
 2 files changed, 95 insertions(+), 1 deletion(-)


base-commit: 0bbac3facb5d6cc0171c45c9873a2dc96bea9680
-- 
2.34.1




Re: [RFC PATCH 0/4] perf: Correlating user process data to samples

2024-04-12 Thread Beau Belgrave
On Fri, Apr 12, 2024 at 09:12:45AM +0200, Peter Zijlstra wrote:
> 
> On Fri, Apr 12, 2024 at 12:17:28AM +, Beau Belgrave wrote:
> 
> > An idea flow would look like this:
> > User Task   Profile
> > do_work();  sample() -> IP + No activity
> > ...
> > set_activity(123);
> > ...
> > do_work();  sample() -> IP + activity (123)
> > ...
> > set_activity(124);
> > ...
> > do_work();  sample() -> IP + activity (124)
> 
> This, start with this, because until I saw this, I was utterly confused
> as to what the heck you were on about.
> 

Will do.

> I started by thinking we already have TID in samples so you can already
> associate back to user processes and got increasingly confused the
> further I went.
> 
> What you seem to want to do however is have some task-state included so
> you can see what the thread is doing.
> 

Yeah, there is typically an external context (not on the machine) that
wants to be tied to each sample. The context could be a simple integer,
UUID, or something else entirely. For OTel, this is a 16-byte array [1].

> Anyway, since we typically run stuff from NMI context, accessing user
> data is 'interesting'. As such I would really like to make this work
> depend on the call-graph rework that pushes all the user access bits
> into return-to-user.

Cool, I assume that's the SFRAME work? Are there pointers to work I
could look at and think about what a rebase looks like? Or do you have
someone in mind I should work with for this?

Thanks,
-Beau

1. https://www.w3.org/TR/trace-context/#version-format



Re: [RFC PATCH 0/4] perf: Correlating user process data to samples

2024-04-12 Thread Beau Belgrave
On Thu, Apr 11, 2024 at 09:52:22PM -0700, Ian Rogers wrote:
> On Thu, Apr 11, 2024 at 5:17 PM Beau Belgrave  
> wrote:
> >
> > In the Open Telemetry profiling SIG [1], we are trying to find a way to
> > grab a tracing association quickly on a per-sample basis. The team at
> > Elastic has a bespoke way to do this [2], however, I'd like to see a
> > more general way to achieve this. The folks I've been talking with seem
> > open to the idea of just having a TLS value for this we could capture
> 
> Presumably TLS == Thread Local Storage.
> 

Yes, the initial idea is to use thread local storage (TLS). It seems to
be the fastest option to save a per-thread value that changes at a fast
rate.

> > upon each sample. We could then just state, Open Telemetry SDKs should
> > have a TLS value for span correlation. However, we need a way to sample
> > the TLS or other value(s) when a sampling event is generated. This is
> > supported today on Windows via EventActivityIdControl() [3]. Since
> > Open Telemetry works on both Windows and Linux, ideally we can do
> > something as efficient for Linux based workloads.
> >
> > This series is to explore how it would be best possible to collect
> > supporting data from a user process when a profile sample is collected.
> > Having a value stored in TLS makes a lot of sense for this however
> > there are other ways to explore. Whatever is chosen, kernel samples
> > taken in process context should be able to get this supporting data.
> > In these patches on X64 the fsbase and gsbase are used for this.
> >
> > An option to explore suggested by Mathieu Desnoyers is to utilize rseq
> > for processes to register a value location that can be included when
> > profiling if desired. This would allow a tighter contract between user
> > processes and a profiler.  It would allow better labeling/categorizing
> > the correlation values.
> 
> It is hard to understand this idea. Are you saying stash a cookie in
> TLS for samples to capture to indicate an activity? Restartable
> sequences are about preemption on a CPU not of a thread, so at least
> my intuition is that they feel different. You could stash information
> like this today by changing the thread name which generates comm
> events. I've wondered about having similar information in some form of
> reserved for profiling stack slot, for example, to stash a pointer to
> the name of a function being interpreted. Snapshotting all of a stack
> is bad performance wise and for security. A stack slot would be able
> to deal with nesting.
> 

You are getting the idea. A slot or tag for a thread would be great! I'm
not a fan of overriding the thread comm name (as that already has a
use). TLS would be fine, if we could also pass an offset + size + type.

Maybe a stack slot that just points to parts of TLS? That way you could
have a set of slots that don't require much memory and selectively copy
them out of TLS (or where ever those slots point to in user memory).

When I was talking to Mathieu about this, it seems that rseq already had
a place to potentially put these slots. I'm unsure though how the per
thread aspects would work.

Mathieu, can you post your ideas here about that?

> > An idea flow would look like this:
> > User Task   Profile
> > do_work();  sample() -> IP + No activity
> > ...
> > set_activity(123);
> > ...
> > do_work();  sample() -> IP + activity (123)
> > ...
> > set_activity(124);
> > ...
> > do_work();  sample() -> IP + activity (124)
> >
> > Ideally, the set_activity() method would not be a syscall. It needs to
> > be very cheap as this should not bottleneck work. Ideally this is just
> > a memcpy of 16-20 bytes as it is on Windows via EventActivityIdControl()
> > using EVENT_ACTIVITY_CTRL_SET_ID.
> >
> > For those not aware, Open Telemetry allows collecting data from multiple
> > machines and show where time was spent. The tracing context is already
> > available for logs, but not for profiling samples. The idea is to show
> > where slowdowns occur and have profile samples to explain why they
> > slowed down. This must be possible without having to track context
> > switches to do this correlation. This is because the profiling rates
> > are typically 20hz - 1Khz, while the context switching rates are much
> > higher. We do not want to have to consume high context switch rates
> > just to know a correlation for a 20hz signal. Often these 20hz signals
> > are always enabled in some environments.
> >
> > Regardless if TLS, rseq, or other source is used I believe we will need
> > a way for pe

[RFC PATCH 2/4] perf: Introduce PERF_SAMPLE_TLS_USER sample type

2024-04-11 Thread Beau Belgrave
When samples are generated, there is no way via the perf_event ABI to
fetch per-thread data. This data is very useful in tracing scenarios
that involve correlation IDs, such as OpenTelemetry. They are also
useful for tracking per-thread performance details directly within a
cooperating user process.

The newly establish OpenTelemetry profiling group requires a way to get
tracing correlations on both Linux and Windows. On Windows this
correlation is on a per-thread basis directly via ETW. On Linux we need
a fast mechanism to store these details and TLS seems like the best
option, see links for more details.

Add a new sample type (PERF_SAMPLE_TLS_USER) that fetches TLS data up to
X bytes per-sample. Use the existing PERF_SAMPLE_STACK_USER ABI for
outputting data out to consumers. Store requested data size by the user
in the previously reserved u16 (__reserved_2) within perf_event_attr.

Add tls_addr and tls_user_size to perf_sample_data and calculate them
during sample preparation. This allows the output side to know if
truncation is going to occur and not having to re-fetch the TLS value
from the user process a second time.

Add CONFIG_HAVE_PERF_USER_TLS_DUMP so that architectures can specify if
they have a TLS specific register (or other logic) that can be used for
dumping. This does not yet enable any architecture to do TLS dump, it
simply makes it possible by allowing a arch defined method named
arch_perf_user_tls_pointer().

Add perf_tls struct that arch_perf_user_tls_pointer() utilizes to set
TLS details of the address and size (for 32bit on 64bit compat cases).

Link: https://opentelemetry.io/blog/2024/profiling/
Link: 
https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation
Signed-off-by: Beau Belgrave 
---
 arch/Kconfig|   7 +++
 include/linux/perf_event.h  |   7 +++
 include/uapi/linux/perf_event.h |   5 +-
 kernel/events/core.c| 105 +++-
 kernel/events/internal.h|  16 +
 5 files changed, 137 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71..6afaf5f46e2f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,13 @@ config HAVE_PERF_USER_STACK_DUMP
  access to the user stack pointer which is not unified across
  architectures.
 
+config HAVE_PERF_USER_TLS_DUMP
+   bool
+   help
+ Support user tls dumps for perf event samples. This needs
+ access to the user tls pointer which is not unified across
+ architectures.
+
 config HAVE_ARCH_JUMP_LABEL
bool
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a..7fac81929eed 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1202,8 +1202,15 @@ struct perf_sample_data {
u64 data_page_size;
u64 code_page_size;
u64 aux_size;
+   u64 tls_addr;
+   u64 tls_user_size;
 } cacheline_aligned;
 
+struct perf_tls {
+   unsigned long base; /* Base address for TLS */
+   unsigned long size; /* Size of base address */
+};
+
 /* default value for data source */
 #define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
PERF_MEM_S(LVL, NA)   |\
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3a64499b0f5d..b62669cfe581 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -162,8 +162,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_DATA_PAGE_SIZE  = 1U << 22,
PERF_SAMPLE_CODE_PAGE_SIZE  = 1U << 23,
PERF_SAMPLE_WEIGHT_STRUCT   = 1U << 24,
+   PERF_SAMPLE_TLS_USER= 1U << 25,
 
-   PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */
+   PERF_SAMPLE_MAX = 1U << 26, /* non-ABI */
 };
 
 #define PERF_SAMPLE_WEIGHT_TYPE(PERF_SAMPLE_WEIGHT | 
PERF_SAMPLE_WEIGHT_STRUCT)
@@ -509,7 +510,7 @@ struct perf_event_attr {
 */
__u32   aux_watermark;
__u16   sample_max_stack;
-   __u16   __reserved_2;
+   __u16   sample_tls_user; /* Size of TLS data to dump on samples */
__u32   aux_sample_size;
__u32   __reserved_3;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 07de5cc2aa25..f848bf4be9bd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6926,6 +6926,45 @@ static u64 perf_ustack_task_size(struct pt_regs *regs)
return TASK_SIZE - addr;
 }
 
+/*
+ * Get remaining task size from user tls pointer.
+ *
+ * Outputs the address to use for the dump to avoid doing
+ * this twice (prepare and output).
+ */
+static u64
+perf_utls_task_size(struct pt_regs *regs, u64 dump_size, u64 *tls_addr)
+{
+   struct perf_tls tls;
+   unsigne

[RFC PATCH 0/4] perf: Correlating user process data to samples

2024-04-11 Thread Beau Belgrave
In the Open Telemetry profiling SIG [1], we are trying to find a way to
grab a tracing association quickly on a per-sample basis. The team at
Elastic has a bespoke way to do this [2], however, I'd like to see a
more general way to achieve this. The folks I've been talking with seem
open to the idea of just having a TLS value for this we could capture
upon each sample. We could then just state, Open Telemetry SDKs should
have a TLS value for span correlation. However, we need a way to sample
the TLS or other value(s) when a sampling event is generated. This is
supported today on Windows via EventActivityIdControl() [3]. Since
Open Telemetry works on both Windows and Linux, ideally we can do
something as efficient for Linux based workloads.

This series is to explore how it would be best possible to collect
supporting data from a user process when a profile sample is collected.
Having a value stored in TLS makes a lot of sense for this however
there are other ways to explore. Whatever is chosen, kernel samples
taken in process context should be able to get this supporting data.
In these patches on X64 the fsbase and gsbase are used for this.

An option to explore suggested by Mathieu Desnoyers is to utilize rseq
for processes to register a value location that can be included when
profiling if desired. This would allow a tighter contract between user
processes and a profiler.  It would allow better labeling/categorizing
the correlation values.

An idea flow would look like this:
User Task   Profile
do_work();  sample() -> IP + No activity
...
set_activity(123);
...
do_work();  sample() -> IP + activity (123)
...
set_activity(124);
...
do_work();  sample() -> IP + activity (124)

Ideally, the set_activity() method would not be a syscall. It needs to
be very cheap as this should not bottleneck work. Ideally this is just
a memcpy of 16-20 bytes as it is on Windows via EventActivityIdControl()
using EVENT_ACTIVITY_CTRL_SET_ID.

For those not aware, Open Telemetry allows collecting data from multiple
machines and show where time was spent. The tracing context is already
available for logs, but not for profiling samples. The idea is to show
where slowdowns occur and have profile samples to explain why they
slowed down. This must be possible without having to track context
switches to do this correlation. This is because the profiling rates
are typically 20hz - 1Khz, while the context switching rates are much
higher. We do not want to have to consume high context switch rates
just to know a correlation for a 20hz signal. Often these 20hz signals
are always enabled in some environments.

Regardless if TLS, rseq, or other source is used I believe we will need
a way for perf_events to include it within a sample. The changes in this
series show how it could be done with TLS. There is some factoring work
under perf to make it easier to add more dump types using the existing
ABI. This is mostly to make the patches clearer, certainly the refactor
parts could get dropped and we could have duplicated/specialized paths.

1. https://opentelemetry.io/blog/2024/profiling/
2. 
https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation
3. 
https://learn.microsoft.com/en-us/windows/win32/api/evntprov/nf-evntprov-eventactivityidcontrol

Beau Belgrave (4):
  perf/core: Introduce perf_prepare_dump_data()
  perf: Introduce PERF_SAMPLE_TLS_USER sample type
  perf/core: Factor perf_output_sample_udump()
  perf/x86/core: Add tls dump support

 arch/Kconfig  |   7 ++
 arch/x86/Kconfig  |   1 +
 arch/x86/events/core.c|  14 +++
 arch/x86/include/asm/perf_event.h |   5 +
 include/linux/perf_event.h|   7 ++
 include/uapi/linux/perf_event.h   |   5 +-
 kernel/events/core.c  | 166 +++---
 kernel/events/internal.h  |  16 +++
 8 files changed, 180 insertions(+), 41 deletions(-)


base-commit: fec50db7033ea478773b159e0e2efb135270e3b7
-- 
2.34.1




[RFC PATCH 4/4] perf/x86/core: Add tls dump support

2024-04-11 Thread Beau Belgrave
Now that perf supports TLS dumps, x86-64 can provide the details for how
to get TLS data for user threads.

Enable HAVE_PERF_USER_TLS_DUMP Kconfig only for x86-64. I do not have
access to x86 to validate 32-bit.

Utilize mmap_is_ia32() to determine 32/64 bit threads. Use fsbase for
64-bit and gsbase for 32-bit with appropriate size.

Signed-off-by: Beau Belgrave 
---
 arch/x86/Kconfig  |  1 +
 arch/x86/events/core.c| 14 ++
 arch/x86/include/asm/perf_event.h |  5 +
 3 files changed, 20 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4fff6ed46e90..8d46ec8ded0c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -263,6 +263,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+   select HAVE_PERF_USER_TLS_DUMP  if X86_64
select MMU_GATHER_RCU_TABLE_FREEif PARAVIRT
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 09050641ce5d..3f851db4c591 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "perf_event.h"
 
@@ -3002,3 +3003,16 @@ u64 perf_get_hw_event_config(int hw_event)
return 0;
 }
 EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
+
+#ifdef CONFIG_X86_64
+void arch_perf_user_tls_pointer(struct perf_tls *tls)
+{
+   if (!mmap_is_ia32()) {
+   tls->base = current->thread.fsbase;
+   tls->size = sizeof(u64);
+   } else {
+   tls->base = current->thread.gsbase;
+   tls->size = sizeof(u32);
+   }
+}
+#endif
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 3736b8a46c04..d0f65e572c20 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -628,4 +628,9 @@ static __always_inline void perf_lopwr_cb(bool lopwr_in)
 
 #define arch_perf_out_copy_user copy_from_user_nmi
 
+#ifdef CONFIG_HAVE_PERF_USER_TLS_DUMP
+struct perf_tls;
+extern void arch_perf_user_tls_pointer(struct perf_tls *tls);
+#endif
+
 #endif /* _ASM_X86_PERF_EVENT_H */
-- 
2.34.1




[RFC PATCH 3/4] perf/core: Factor perf_output_sample_udump()

2024-04-11 Thread Beau Belgrave
We now have two user dump sources (stack and tls). Both are doing the
same logic to ensure the user dump ABI output is properly handled. The
only difference is one gets the address within the method, and the other
is passed the address.

Add perf_output_sample_udump() and utilize it for both stack and tls
sample dumps. The sp register is now fetched outside of this method and
passed to it. This allows both stack and tls to utilize the same code.

Signed-off-by: Beau Belgrave 
---
 kernel/events/core.c | 68 +---
 1 file changed, 19 insertions(+), 49 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f848bf4be9bd..6b3cf5afdd32 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6998,47 +6998,10 @@ perf_sample_dump_size(u16 dump_size, u16 header_size, 
u64 task_size)
 }
 
 static void
-perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
- struct pt_regs *regs)
-{
-   /* Case of a kernel thread, nothing to dump */
-   if (!regs) {
-   u64 size = 0;
-   perf_output_put(handle, size);
-   } else {
-   unsigned long sp;
-   unsigned int rem;
-   u64 dyn_size;
-
-   /*
-* We dump:
-* static size
-*   - the size requested by user or the best one we can fit
-* in to the sample max size
-* data
-*   - user stack dump data
-* dynamic size
-*   - the actual dumped size
-*/
-
-   /* Static size. */
-   perf_output_put(handle, dump_size);
-
-   /* Data. */
-   sp = perf_user_stack_pointer(regs);
-   rem = __output_copy_user(handle, (void *) sp, dump_size);
-   dyn_size = dump_size - rem;
-
-   perf_output_skip(handle, rem);
-
-   /* Dynamic size. */
-   perf_output_put(handle, dyn_size);
-   }
-}
-
-static void
-perf_output_sample_utls(struct perf_output_handle *handle, u64 addr,
-   u64 dump_size, struct pt_regs *regs)
+perf_output_sample_udump(struct perf_output_handle *handle,
+unsigned long addr,
+u64 dump_size,
+struct pt_regs *regs)
 {
/* Case of a kernel thread, nothing to dump */
if (!regs) {
@@ -7054,7 +7017,7 @@ perf_output_sample_utls(struct perf_output_handle 
*handle, u64 addr,
 *   - the size requested by user or the best one we can fit
 * in to the sample max size
 * data
-*   - user tls dump data
+*   - user dump data
 * dynamic size
 *   - the actual dumped size
 */
@@ -7507,9 +7470,16 @@ void perf_output_sample(struct perf_output_handle 
*handle,
}
 
if (sample_type & PERF_SAMPLE_STACK_USER) {
-   perf_output_sample_ustack(handle,
- data->stack_user_size,
- data->regs_user.regs);
+   struct pt_regs *regs = data->regs_user.regs;
+   unsigned long sp = 0;
+
+   if (regs)
+   sp = perf_user_stack_pointer(regs);
+
+   perf_output_sample_udump(handle,
+sp,
+data->stack_user_size,
+regs);
}
 
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
@@ -7551,10 +7521,10 @@ void perf_output_sample(struct perf_output_handle 
*handle,
perf_output_put(handle, data->code_page_size);
 
if (sample_type & PERF_SAMPLE_TLS_USER) {
-   perf_output_sample_utls(handle,
-   data->tls_addr,
-   data->tls_user_size,
-   data->regs_user.regs);
+   perf_output_sample_udump(handle,
+data->tls_addr,
+data->tls_user_size,
+data->regs_user.regs);
}
 
if (sample_type & PERF_SAMPLE_AUX) {
-- 
2.34.1




[RFC PATCH 1/4] perf/core: Introduce perf_prepare_dump_data()

2024-04-11 Thread Beau Belgrave
Factor out perf_prepare_dump_data() so that the same logic is used for
dumping stack data as other types, such as TLS.

Slightly refactor perf_sample_ustack_size() to perf_sample_dump_size().
Move reg checks up into perf_ustack_task_size() since the task size
must now be calculated before preparing dump data.

Signed-off-by: Beau Belgrave 
---
 kernel/events/core.c | 79 ++--
 1 file changed, 47 insertions(+), 32 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..07de5cc2aa25 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6912,7 +6912,13 @@ static void perf_sample_regs_intr(struct perf_regs 
*regs_intr,
  */
 static u64 perf_ustack_task_size(struct pt_regs *regs)
 {
-   unsigned long addr = perf_user_stack_pointer(regs);
+   unsigned long addr;
+
+   /* No regs, no stack pointer, no dump. */
+   if (!regs)
+   return 0;
+
+   addr = perf_user_stack_pointer(regs);
 
if (!addr || addr >= TASK_SIZE)
return 0;
@@ -6921,42 +6927,35 @@ static u64 perf_ustack_task_size(struct pt_regs *regs)
 }
 
 static u16
-perf_sample_ustack_size(u16 stack_size, u16 header_size,
-   struct pt_regs *regs)
+perf_sample_dump_size(u16 dump_size, u16 header_size, u64 task_size)
 {
-   u64 task_size;
-
-   /* No regs, no stack pointer, no dump. */
-   if (!regs)
-   return 0;
-
/*
-* Check if we fit in with the requested stack size into the:
+* Check if we fit in with the requested dump size into the:
 * - TASK_SIZE
 *   If we don't, we limit the size to the TASK_SIZE.
 *
 * - remaining sample size
-*   If we don't, we customize the stack size to
+*   If we don't, we customize the dump size to
 *   fit in to the remaining sample size.
 */
 
-   task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
-   stack_size = min(stack_size, (u16) task_size);
+   task_size  = min((u64) USHRT_MAX, task_size);
+   dump_size = min(dump_size, (u16) task_size);
 
/* Current header size plus static size and dynamic size. */
header_size += 2 * sizeof(u64);
 
-   /* Do we fit in with the current stack dump size? */
-   if ((u16) (header_size + stack_size) < header_size) {
+   /* Do we fit in with the current dump size? */
+   if ((u16) (header_size + dump_size) < header_size) {
/*
 * If we overflow the maximum size for the sample,
-* we customize the stack dump size to fit in.
+* we customize the dump size to fit in.
 */
-   stack_size = USHRT_MAX - header_size - sizeof(u64);
-   stack_size = round_up(stack_size, sizeof(u64));
+   dump_size = USHRT_MAX - header_size - sizeof(u64);
+   dump_size = round_up(dump_size, sizeof(u64));
}
 
-   return stack_size;
+   return dump_size;
 }
 
 static void
@@ -7648,6 +7647,32 @@ static __always_inline u64 __cond_set(u64 flags, u64 s, 
u64 d)
return d * !!(flags & s);
 }
 
+static inline u16
+perf_prepare_dump_data(struct perf_sample_data *data,
+  struct perf_event *event,
+  struct pt_regs *regs,
+  u16 dump_size,
+  u64 task_size)
+{
+   u16 header_size = perf_sample_data_size(data, event);
+   u16 size = sizeof(u64);
+
+   dump_size = perf_sample_dump_size(dump_size, header_size,
+ task_size);
+
+   /*
+* If there is something to dump, add space for the dump
+* itself and for the field that tells the dynamic size,
+* which is how many have been actually dumped.
+*/
+   if (dump_size)
+   size += sizeof(u64) + dump_size;
+
+   data->dyn_size += size;
+
+   return dump_size;
+}
+
 void perf_prepare_sample(struct perf_sample_data *data,
 struct perf_event *event,
 struct pt_regs *regs)
@@ -7725,22 +7750,12 @@ void perf_prepare_sample(struct perf_sample_data *data,
 * up the rest of the sample size.
 */
u16 stack_size = event->attr.sample_stack_user;
-   u16 header_size = perf_sample_data_size(data, event);
-   u16 size = sizeof(u64);
-
-   stack_size = perf_sample_ustack_size(stack_size, header_size,
-data->regs_user.regs);
+   u64 task_size = perf_ustack_task_size(regs);
 
-   /*
-* If there is something to dump, add space for the dump
-* itself and for the field that tells the dynamic size,
-* which is how many hav

Re: Copying TLS/user register data per perf-sample?

2024-04-11 Thread Beau Belgrave
On Fri, Apr 12, 2024 at 12:55:19AM +0900, Masami Hiramatsu wrote:
> On Wed, 10 Apr 2024 08:35:42 -0700
> Beau Belgrave  wrote:
> 
> > On Wed, Apr 10, 2024 at 10:06:28PM +0900, Masami Hiramatsu wrote:
> > > On Thu, 4 Apr 2024 12:26:41 -0700
> > > Beau Belgrave  wrote:
> > > 
> > > > Hello,
> > > > 
> > > > I'm looking into the possibility of capturing user data that is pointed
> > > > to by a user register (IE: fs/gs for TLS on x86/64) for each sample via
> > > > perf_events.
> > > > 
> > > > I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER.
> > > > I think it could even use roughly the same ABI in the perf ring buffer.
> > > > Or it may be possible by some kprobe linked to the perf sample function.
> > > > 
> > > > This would allow a profiler to collect TLS (or other values) on x64. In
> > > > the Open Telemetry profiling SIG [1], we are trying to find a fast way
> > > > to grab a tracing association quickly on a per-thread basis. The team
> > > > at Elastic has a bespoke way to do this [2], however, I'd like to see a
> > > > more general way to achieve this. The folks I've been talking with seem
> > > > open to the idea of just having a TLS value for this we could capture
> > > > upon each sample. We could then just state, Open Telemetry SDKs should
> > > > have a TLS value for span correlation. However, we need a way to sample
> > > > the TLS value(s) when a sampling event is generated.
> > > > 
> > > > Is this already possible via some other means? It'd be great to be able
> > > > to do this directly at the perf_event sample via the ABI or a probe.
> > > > 
> > > 
> > > Have you tried to use uprobes? It should be able to access user-space
> > > registers including fs/gs.
> > > 
> > 
> > We need to get fs/gs during a sample interrupt from perf. If the sample
> > interrupt lands during kernel code (IE: syscall) we would also like to
> > get these TLS values when in process context.
> 
> OK, those are not directly accessible from pt_regs.
> 

Yeah, it's a per-arch thread attribute.

> > 
> > I have some patches into the kernel to make this possible via
> > perf_events that works well, however, I don't want to reinvent the wheel
> > if there is some way to get these via perf samples already.
> 
> I would like to see it. I think it is possible to introduce a helper
> to get a base address of user TLS for probe events, and start supporting
> from x86.
> 

For sure, I'm hoping the patches start the right conversations.

> > 
> > In OTel, we are trying to attribute samples to transactions that are
> > occurring. So the TLS fetch has to be aligned exactly with the sample.
> > You can do this via eBPF when it's available, however, we have
> > environments where eBPF is not available.
> > 
> > It's sounding like to do this properly without eBPF a new feature would
> > be required. If so, I do have some patches I can share in a bit as an
> > RFC.
> 
> It is better to be shared in RFC stage, so that we can discuss it from
> the direction level.
> 

Agree, it could be that having the ability to run a probe on sample may
be a better option. Not sure.

Thanks,
-Beau

> Thank you,
> 
> > 
> > Thanks,
> > -Beau
> > 
> > > Thank you,
> > > 
> > > -- 
> > > Masami Hiramatsu (Google) 
> 
> 
> -- 
> Masami Hiramatsu (Google) 



Re: Copying TLS/user register data per perf-sample?

2024-04-10 Thread Beau Belgrave
On Tue, Apr 09, 2024 at 04:32:46PM -0700, Namhyung Kim wrote:
> Hello,
> 
> On Thu, Apr 4, 2024 at 12:26 PM Beau Belgrave  
> wrote:
> >
> > Hello,
> >
> > I'm looking into the possibility of capturing user data that is pointed
> > to by a user register (IE: fs/gs for TLS on x86/64) for each sample via
> > perf_events.
> >
> > I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER.
> > I think it could even use roughly the same ABI in the perf ring buffer.
> > Or it may be possible by some kprobe linked to the perf sample function.
> >
> > This would allow a profiler to collect TLS (or other values) on x64. In
> > the Open Telemetry profiling SIG [1], we are trying to find a fast way
> > to grab a tracing association quickly on a per-thread basis. The team
> > at Elastic has a bespoke way to do this [2], however, I'd like to see a
> > more general way to achieve this. The folks I've been talking with seem
> > open to the idea of just having a TLS value for this we could capture
> > upon each sample. We could then just state, Open Telemetry SDKs should
> > have a TLS value for span correlation. However, we need a way to sample
> > the TLS value(s) when a sampling event is generated.
> >
> > Is this already possible via some other means? It'd be great to be able
> > to do this directly at the perf_event sample via the ABI or a probe.
> 
> I don't think the current perf ABI allows capturing %fs/%gs + offset.
> IIRC kprobes/uprobes don't have that too but I could be wrong.
> 

Yeah, I didn't see it either. I have some patches that I will submit in
a bit as RFC that enable this functionality. I was hoping there was
already an easy way to do this.

Thanks,
-Beau

> Thanks,
> Namhyung
> 
> >
> > 1. https://opentelemetry.io/blog/2024/profiling/
> > 2. 
> > https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation



Re: Copying TLS/user register data per perf-sample?

2024-04-10 Thread Beau Belgrave
On Wed, Apr 10, 2024 at 10:06:28PM +0900, Masami Hiramatsu wrote:
> On Thu, 4 Apr 2024 12:26:41 -0700
> Beau Belgrave  wrote:
> 
> > Hello,
> > 
> > I'm looking into the possibility of capturing user data that is pointed
> > to by a user register (IE: fs/gs for TLS on x86/64) for each sample via
> > perf_events.
> > 
> > I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER.
> > I think it could even use roughly the same ABI in the perf ring buffer.
> > Or it may be possible by some kprobe linked to the perf sample function.
> > 
> > This would allow a profiler to collect TLS (or other values) on x64. In
> > the Open Telemetry profiling SIG [1], we are trying to find a fast way
> > to grab a tracing association quickly on a per-thread basis. The team
> > at Elastic has a bespoke way to do this [2], however, I'd like to see a
> > more general way to achieve this. The folks I've been talking with seem
> > open to the idea of just having a TLS value for this we could capture
> > upon each sample. We could then just state, Open Telemetry SDKs should
> > have a TLS value for span correlation. However, we need a way to sample
> > the TLS value(s) when a sampling event is generated.
> > 
> > Is this already possible via some other means? It'd be great to be able
> > to do this directly at the perf_event sample via the ABI or a probe.
> > 
> 
> Have you tried to use uprobes? It should be able to access user-space
> registers including fs/gs.
> 

We need to get fs/gs during a sample interrupt from perf. If the sample
interrupt lands during kernel code (IE: syscall) we would also like to
get these TLS values when in process context.

I have some patches into the kernel to make this possible via
perf_events that works well, however, I don't want to reinvent the wheel
if there is some way to get these via perf samples already.

In OTel, we are trying to attribute samples to transactions that are
occurring. So the TLS fetch has to be aligned exactly with the sample.
You can do this via eBPF when it's available, however, we have
environments where eBPF is not available.

It's sounding like to do this properly without eBPF a new feature would
be required. If so, I do have some patches I can share in a bit as an
RFC.

Thanks,
-Beau

> Thank you,
> 
> -- 
> Masami Hiramatsu (Google) 



Copying TLS/user register data per perf-sample?

2024-04-04 Thread Beau Belgrave
Hello,

I'm looking into the possibility of capturing user data that is pointed
to by a user register (IE: fs/gs for TLS on x86/64) for each sample via
perf_events.

I was hoping to find a way to do this similar to PERF_SAMPLE_STACK_USER.
I think it could even use roughly the same ABI in the perf ring buffer.
Or it may be possible by some kprobe linked to the perf sample function.

This would allow a profiler to collect TLS (or other values) on x64. In
the Open Telemetry profiling SIG [1], we are trying to find a fast way
to grab a tracing association quickly on a per-thread basis. The team
at Elastic has a bespoke way to do this [2], however, I'd like to see a
more general way to achieve this. The folks I've been talking with seem
open to the idea of just having a TLS value for this we could capture
upon each sample. We could then just state, Open Telemetry SDKs should
have a TLS value for span correlation. However, we need a way to sample
the TLS value(s) when a sampling event is generated.

Is this already possible via some other means? It'd be great to be able
to do this directly at the perf_event sample via the ABI or a probe.

Thanks,
-Beau

1. https://opentelemetry.io/blog/2024/profiling/
2. 
https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation



[PATCH v4 4/4] tracing/user_events: Document multi-format flag

2024-02-21 Thread Beau Belgrave
User programs can now ask user_events to handle the synchronization of
multiple different formats for an event with the same name via the new
USER_EVENT_REG_MULTI_FORMAT flag.

Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended
purpose and caveats of using it. Explain how deletion works in these
cases and how to use /sys/kernel/tracing/dynamic_events for per-version
deletion.

Signed-off-by: Beau Belgrave 
---
 Documentation/trace/user_events.rst | 27 ++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/Documentation/trace/user_events.rst 
b/Documentation/trace/user_events.rst
index d8f12442aaa6..1d5a7626e6a6 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -92,6 +92,24 @@ The following flags are currently supported.
   process closes or unregisters the event. Requires CAP_PERFMON otherwise
   -EPERM is returned.
 
++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This
+  allows programs to prevent themselves from being blocked when their event
+  format changes and they wish to use the same name. When this flag is used the
+  tracepoint name will be in the new format of "name.unique_id" vs the older
+  format of "name". A tracepoint will be created for each unique pair of name
+  and format. This means if several processes use the same name and format,
+  they will use the same tracepoint. If yet another process uses the same name,
+  but a different format than the other processes, it will use a different
+  tracepoint with a new unique id. Recording programs need to scan tracefs for
+  the various different formats of the event name they are interested in
+  recording. The system name of the tracepoint will also use 
"user_events_multi"
+  instead of "user_events". This prevents single-format event names conflicting
+  with any multi-format event names within tracefs. The unique_id is output as
+  a hex string. Recording programs should ensure the tracepoint name starts 
with
+  the event name they registered and has a suffix that starts with . and only
+  has hex characters. For example to find all versions of the event "test" you
+  can use the regex "^test\.[0-9a-fA-F]+$".
+
 Upon successful registration the following is set.
 
 + write_index: The index to use for this file descriptor that represents this
@@ -106,6 +124,9 @@ or perf record -e user_events:[name] when 
attaching/recording.
 **NOTE:** The event subsystem name by default is "user_events". Callers should
 not assume it will always be "user_events". Operators reserve the right in the
 future to change the subsystem name per-process to accommodate event isolation.
+In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name
+will have a unique id appended to it and the system name will be
+"user_events_multi" as described above.
 
 Command Format
 ^^
@@ -156,7 +177,11 @@ to request deletes than the one used for registration due 
to this.
 to the event. If programs do not want auto-delete, they must use the
 USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used
 the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an
-event that persists requires CAP_PERFMON, otherwise -EPERM is returned.
+event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When
+there are multiple formats of the same event name, all events with the same
+name will be attempted to be deleted. If only a specific version is wanted to
+be deleted then the /sys/kernel/tracing/dynamic_events file should be used for
+that specific format of the event.
 
 Unregistering
 -
-- 
2.34.1




[PATCH v4 1/4] tracing/user_events: Prepare find/delete for same name events

2024-02-21 Thread Beau Belgrave
The current code for finding and deleting events assumes that there will
never be cases when user_events are registered with the same name, but
different formats. Scenarios exist where programs want to use the same
name but have different formats. An example is multiple versions of a
program running side-by-side using the same event name, but with updated
formats in each version.

This change does not yet allow for multi-format events. If user_events
are registered with the same name but different arguments the programs
see the same return values as before. This change simply makes it
possible to easily accomodate for this.

Update find_user_event() to take in argument parameters and register
flags to accomodate future multi-format event scenarios. Have find
validate argument matching and return error pointers to cover when
an existing event has the same name but different format. Update
callers to handle error pointer logic.

Move delete_user_event() to use hash walking directly now that
find_user_event() has changed. Delete all events found that match the
register name, stop if an error occurs and report back to the user.

Update user_fields_match() to cover list_empty() scenarios now that
find_user_event() uses it directly. This makes the logic consistent
accross several callsites.

Signed-off-by: Beau Belgrave 
---
 kernel/trace/trace_events_user.c | 107 +--
 1 file changed, 59 insertions(+), 48 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..dda58681247e 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct 
user_event_mm *mm);
 static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
 static void user_event_mm_put(struct user_event_mm *mm);
 static int destroy_user_event(struct user_event *user);
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv);
 
 static u32 user_event_key(char *name)
 {
@@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user)
 }
 
 static struct user_event *find_user_event(struct user_event_group *group,
- char *name, u32 *outkey)
+ char *name, int argc, const char 
**argv,
+ u32 flags, u32 *outkey)
 {
struct user_event *user;
u32 key = user_event_key(name);
 
*outkey = key;
 
-   hash_for_each_possible(group->register_table, user, node, key)
-   if (!strcmp(EVENT_NAME(user), name))
+   hash_for_each_possible(group->register_table, user, node, key) {
+   if (strcmp(EVENT_NAME(user), name))
+   continue;
+
+   if (user_fields_match(user, argc, argv))
return user_event_get(user);
 
+   return ERR_PTR(-EADDRINUSE);
+   }
+
return NULL;
 }
 
@@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, 
int argc,
struct list_head *head = >fields;
int i = 0;
 
+   if (argc == 0)
+   return list_empty(head);
+
list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, ))
return false;
@@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const 
char *event,
match = strcmp(EVENT_NAME(user), event) == 0 &&
(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
 
-   if (match && argc > 0)
+   if (match)
match = user_fields_match(user, argc, argv);
-   else if (match && argc == 0)
-   match = list_empty(>fields);
 
return match;
 }
@@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
char *args, char *flags,
struct user_event **newuser, int reg_flags)
 {
-   int ret;
-   u32 key;
struct user_event *user;
+   char **argv = NULL;
int argc = 0;
-   char **argv;
+   int ret;
+   u32 key;
 
/* Currently don't support any text based flags */
if (flags != NULL)
@@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
if (!user_event_capable(reg_flags))
return -EPERM;
 
+   if (args) {
+   argv = argv_split(GFP_KERNEL, args, );
+
+   if (!argv)
+   return -ENOMEM;
+   }
+
/* Prevent dyn_event from racing */
mutex_lock(_mutex);
-   user = find_user_event(group, name, );
+   user = find_user_event(group, name, argc, (const char **)argv,
+  reg_flags, );
m

[PATCH v4 2/4] tracing/user_events: Introduce multi-format events

2024-02-21 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event. When this flag is used, create
the underlying tracepoint backing the user_event with a unique name
per-version of the format. It's important that existing ABI users do
not get this logic automatically, even if one of the multi format
events matches the format. This ensures existing programs that create
events and assume the tracepoint name will match exactly continue to
work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.
This prevents single-format names from conflicting with multi-format
events if they end with the same suffix as the multi-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}.{unique_id}.

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test.0
  test.1

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Signed-off-by: Beau Belgrave 
---
 include/uapi/linux/user_events.h |   6 +-
 kernel/trace/trace_events_user.c | 102 +++
 2 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index f74f3aedd49c..a03de03dccbc 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -12,6 +12,7 @@
 #include 
 
 #define USER_EVENTS_SYSTEM "user_events"
+#define USER_EVENTS_MULTI_SYSTEM "user_events_multi"
 #define USER_EVENTS_PREFIX "u:"
 
 /* Create dynamic location entry within a 32-bit value */
@@ -22,8 +23,11 @@ enum user_reg_flag {
/* Event will not delete upon last reference closing */
USER_EVENT_REG_PERSIST  = 1U << 0,
 
+   /* Event will be allowed to have multiple formats */
+   USER_EVENT_REG_MULTI_FORMAT = 1U << 1,
+
/* This value or above is currently non-ABI */
-   USER_EVENT_REG_MAX  = 1U << 1,
+   USER_EVENT_REG_MAX  = 1U << 2,
 };
 
 /*
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index dda58681247e..044920c415e7 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -34,7 +34,8 @@
 
 /* Limit how long of an event name plus args within the subsystem. */
 #define MAX_EVENT_DESC 512
-#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define EVENT_NAME(user_event) ((user_event)->reg_name)
+#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
 
 /*
@@ -54,10 +55,13 @@
  * allows isolation for events by various means.
  */
 struct user_event_group {
-   char*system_name;
-   struct  hlist_node node;
-   struct  mutex reg_mutex;
+   char*system_name;
+   char*system_multi_name;
+   struct hlist_node   node;
+   struct mutexreg_mutex;
DECLARE_HASHTABLE(register_table, 8);
+   /* ID that moves forward within

[PATCH v4 3/4] selftests/user_events: Test multi-format events

2024-02-21 Thread Beau Belgrave
User_events now has multi-format events which allow for the same
register name, but with different formats. When this occurs, different
tracepoints are created with unique names.

Add a new test that ensures the same name can be used for two different
formats. Ensure they are isolated from each other and that name and arg
matching still works if yet another register comes in with the same
format as one of the two.

Signed-off-by: Beau Belgrave 
---
 .../testing/selftests/user_events/abi_test.c  | 134 ++
 1 file changed, 134 insertions(+)

diff --git a/tools/testing/selftests/user_events/abi_test.c 
b/tools/testing/selftests/user_events/abi_test.c
index cef1ff1af223..7288a05136ba 100644
--- a/tools/testing/selftests/user_events/abi_test.c
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "../kselftest_harness.h"
@@ -23,6 +25,62 @@
 
 const char *data_file = "/sys/kernel/tracing/user_events_data";
 const char *enable_file = 
"/sys/kernel/tracing/events/user_events/__abi_event/enable";
+const char *multi_dir_glob = 
"/sys/kernel/tracing/events/user_events_multi/__abi_event.*";
+
+static int wait_for_delete(char *dir)
+{
+   struct stat buf;
+   int i;
+
+   for (i = 0; i < 1; ++i) {
+   if (stat(dir, ) == -1 && errno == ENOENT)
+   return 0;
+
+   usleep(1000);
+   }
+
+   return -1;
+}
+
+static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len)
+{
+   char path[256];
+   glob_t buf;
+   int i, ret;
+
+   ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, );
+
+   if (ret)
+   return -1;
+
+   ret = -1;
+
+   for (i = 0; i < buf.gl_pathc; ++i) {
+   FILE *fp;
+
+   snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]);
+   fp = fopen(path, "r");
+
+   if (!fp)
+   continue;
+
+   while (fgets(path, sizeof(path), fp) != NULL) {
+   if (strstr(path, unique_field)) {
+   fclose(fp);
+   /* strscpy is not available, use snprintf */
+   snprintf(out_dir, dir_len, "%s", 
buf.gl_pathv[i]);
+   ret = 0;
+   goto out;
+   }
+   }
+
+   fclose(fp);
+   }
+out:
+   globfree();
+
+   return ret;
+}
 
 static bool event_exists(void)
 {
@@ -74,6 +132,39 @@ static int event_delete(void)
return ret;
 }
 
+static int reg_enable_multi(void *enable, int size, int bit, int flags,
+   char *args)
+{
+   struct user_reg reg = {0};
+   char full_args[512] = {0};
+   int fd = open(data_file, O_RDWR);
+   int len;
+   int ret;
+
+   if (fd < 0)
+   return -1;
+
+   len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args);
+
+   if (len > sizeof(full_args)) {
+   ret = -E2BIG;
+   goto out;
+   }
+
+   reg.size = sizeof(reg);
+   reg.name_args = (__u64)full_args;
+   reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags;
+   reg.enable_bit = bit;
+   reg.enable_addr = (__u64)enable;
+   reg.enable_size = size;
+
+   ret = ioctl(fd, DIAG_IOCSREG, );
+out:
+   close(fd);
+
+   return ret;
+}
+
 static int reg_enable_flags(void *enable, int size, int bit, int flags)
 {
struct user_reg reg = {0};
@@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) {
ASSERT_NE(0, reg_enable(>check, 128, 0));
 }
 
+TEST_F(user, multi_format) {
+   char first_dir[256];
+   char second_dir[256];
+   struct stat buf;
+
+   /* Multiple formats for the same name should work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0,
+ 0, "u32 multi_first"));
+
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1,
+ 0, "u64 multi_second"));
+
+   /* Same name with same format should also work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2,
+ 0, "u64 multi_second"));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_first",
+ first_dir, sizeof(first_dir)));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_second",
+ second_dir, sizeof(second_dir)));
+
+   /* Should not be found in the same dir */
+   ASSERT_NE(0, strcmp(first_dir, second_dir));
+
+   /* First dir should still exist */
+   ASSERT_EQ(0, stat(first_dir, ));
+
+   /* Disabling first register sho

[PATCH v4 0/4] tracing/user_events: Introduce multi-format events

2024-02-21 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event. When this flag is used, create
the underlying tracepoint backing the user_event with a unique name
per-version of the format. It's important that existing ABI users do
not get this logic automatically, even if one of the multi format
events matches the format. This ensures existing programs that create
events and assume the tracepoint name will match exactly continue to
work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.
This prevents single-format names from conflicting with multi-format
events if they end with the same suffix as the multi-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}.{unique_id}.

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test.0
  test.1

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Changes in v4:
  Use kstrdup() in user_event_group_system_multi_name() vs kmalloc() and
  snprintf().
  
  Use kasprintf() in user_event_set_tp_name() vs kzalloc() and snprintf().

  Grammar fixes in change logs.

Changes in v3:
  Use hash_for_each_possible_safe() in destroy_user_event() to prevent
  use after free (caught by kernel test robot ).

Changes in v2:
  Tracepoint names changed from "name:[id]" to "name.id". Feedback
  was the : could conflict with system name formats. []'s are also
  special characters for bash.

  Updated self-test and docs to reflect the new suffix format.

  Updated docs to include a regex example to help guide recording
  programs find the correct event in ambiguous cases.

Beau Belgrave (4):
  tracing/user_events: Prepare find/delete for same name events
  tracing/user_events: Introduce multi-format events
  selftests/user_events: Test multi-format events
  tracing/user_events: Document multi-format flag

 Documentation/trace/user_events.rst   |  27 ++-
 include/uapi/linux/user_events.h  |   6 +-
 kernel/trace/trace_events_user.c  | 209 +-
 .../testing/selftests/user_events/abi_test.c  | 134 +++
 4 files changed, 314 insertions(+), 62 deletions(-)


base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a
-- 
2.34.1




Re: [PATCH v3 1/4] tracing/user_events: Prepare find/delete for same name events

2024-02-21 Thread Beau Belgrave
On Wed, Feb 21, 2024 at 10:17:21AM -0500, Steven Rostedt wrote:
> On Wed, 14 Feb 2024 17:50:43 +
> Beau Belgrave  wrote:
> 
> So the patches look good, but since I gave you some updates, I'm now going
> to go though "nits". Like grammar and such ;-)
> 

Sure thing.

> > The current code for finding and deleting events assumes that there will
> > never be cases when user_events are registered with the same name, but
> > different formats. In the future this scenario will exist to ensure
> 
> > user programs can be updated or modify their events and run different
> > versions of their programs side-by-side without being blocked.
> 
> Can you change the last sentence above. I read it three times and it's
> still awkward to understand it. Particularly, the "user programs can be
> updated or modify their events". That just doesn't want to compute.
> 

Yeah, I'll clean this up.

> > 
> > This change does not yet allow for multi-format events. If user_events
> > are registered with the same name but different arguments the programs
> > see the same return values as before. This change simply makes it
> > possible to easily accomodate for this in future changes.
> 
> I think you can drop the "in future changes" part.
> 

Agreed.

> > 
> > Update find_user_event() to take in argument parameters and register
> > flags to accomodate future multi-format event scenarios. Have find
> > validate argument matching and return error pointers to cover address
> > in use cases, or allocation errors. Update callers to handle error
> 
>   "to cover address in use cases" ?

Yeah, if the ABI is using a single-format event and it's already in use,
we return -EADDRINUSE. It does not happen in multi-format event cases,
since that is allowed.

I'll try to clarify this a bit.

> 
> > pointer logic.
> > 
> > Move delete_user_event() to use hash walking directly now that find has
> > changed. Delete all events found that match the register name, stop
> 
> "now that find has changed" ?  You mean the "find function"?
> 

Yeah, I'll just use the function name here, find_user_event().

> > if an error occurs and report back to the user.
> > 
> > Update user_fields_match() to cover list_empty() scenarios instead of
> > each callsite doing it now that find_user_event() uses it directly.
> 
> The above is a bit of a run-on sentence.
> 

I'll clean it up a bit.

Thanks,
-Beau

> -- Steve
> 
> > 
> > Signed-off-by: Beau Belgrave 
> > ---



Re: [PATCH v3 2/4] tracing/user_events: Introduce multi-format events

2024-02-21 Thread Beau Belgrave
On Wed, Feb 21, 2024 at 10:08:33AM -0500, Steven Rostedt wrote:
> On Wed, 14 Feb 2024 17:50:44 +
> Beau Belgrave  wrote:
> 
> > +static char *user_event_group_system_multi_name(void)
> > +{
> > +   char *system_name;
> > +   int len = sizeof(USER_EVENTS_MULTI_SYSTEM) + 1;
> 
> FYI, the sizeof() will include the "\0" so no need for "+ 1", but I don't
> think this matters as for what I mention below.
> 
> > +
> > +   system_name = kmalloc(len, GFP_KERNEL);
> > +
> > +   if (!system_name)
> > +   return NULL;
> > +
> > +   snprintf(system_name, len, "%s", USER_EVENTS_MULTI_SYSTEM);
> > +
> > +   return system_name;
> 
> Hmm, the above looks like an open coded version of:
> 
>   system_name = kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL);
> 

That's much cleaner, I'll move to that.

> 
> > +}
> > +
> >  static struct user_event_group *current_user_event_group(void)
> >  {
> > return init_group;
> > @@ -367,6 +390,11 @@ static struct user_event_group 
> > *user_event_group_create(void)
> > if (!group->system_name)
> > goto error;
> >  
> > +   group->system_multi_name = user_event_group_system_multi_name();
> > +
> > +   if (!group->system_multi_name)
> > +   goto error;
> > +
> > mutex_init(>reg_mutex);
> > hash_init(group->register_table);
> >  
> > @@ -1482,6 +1510,11 @@ static int destroy_user_event(struct user_event 
> > *user)
> > hash_del(>node);
> >  
> > user_event_destroy_validators(user);
> > +
> > +   /* If we have different names, both must be freed */
> > +   if (EVENT_NAME(user) != EVENT_TP_NAME(user))
> > +   kfree(EVENT_TP_NAME(user));
> > +
> > kfree(user->call.print_fmt);
> > kfree(EVENT_NAME(user));
> > kfree(user);
> > @@ -1504,12 +1537,24 @@ static struct user_event *find_user_event(struct 
> > user_event_group *group,
> > *outkey = key;
> >  
> > hash_for_each_possible(group->register_table, user, node, key) {
> > +   /*
> > +* Single-format events shouldn't return multi-format
> > +* events. Callers expect the underlying tracepoint to match
> > +* the name exactly in these cases. Only check like-formats.
> > +*/
> > +   if (EVENT_MULTI_FORMAT(flags) != 
> > EVENT_MULTI_FORMAT(user->reg_flags))
> > +   continue;
> > +
> > if (strcmp(EVENT_NAME(user), name))
> > continue;
> >  
> > if (user_fields_match(user, argc, argv))
> > return user_event_get(user);
> >  
> > +   /* Scan others if this is a multi-format event */
> > +   if (EVENT_MULTI_FORMAT(flags))
> > +   continue;
> > +
> > return ERR_PTR(-EADDRINUSE);
> > }
> >  
> > @@ -1889,8 +1934,12 @@ static bool user_event_match(const char *system, 
> > const char *event,
> > struct user_event *user = container_of(ev, struct user_event, devent);
> > bool match;
> >  
> > -   match = strcmp(EVENT_NAME(user), event) == 0 &&
> > -   (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
> > +   match = strcmp(EVENT_NAME(user), event) == 0;
> > +
> > +   if (match && system) {
> > +   match = strcmp(system, user->group->system_name) == 0 ||
> > +   strcmp(system, user->group->system_multi_name) == 0;
> > +   }
> >  
> > if (match)
> > match = user_fields_match(user, argc, argv);
> > @@ -1923,6 +1972,39 @@ static int user_event_trace_register(struct 
> > user_event *user)
> > return ret;
> >  }
> >  
> > +static int user_event_set_tp_name(struct user_event *user)
> > +{
> > +   lockdep_assert_held(>group->reg_mutex);
> > +
> > +   if (EVENT_MULTI_FORMAT(user->reg_flags)) {
> > +   char *multi_name;
> > +   int len;
> > +
> > +   len = snprintf(NULL, 0, "%s.%llx", user->reg_name,
> > +  user->group->multi_id) + 1;
> > +
> > +   multi_name = kzalloc(len, GFP_KERNEL_ACCOUNT);
> > +
> > +   if (!multi_name)
> > +   return -ENOMEM;
> > +
> > +   snprintf(multi_name, len, "%s.%llx", user->reg_name,
> > +user->group->multi_id);
> 
> I believe the above can be replaced with:
> 
>   multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx", 
> user->reg_name,
>  user->group->multi_id);
>   if (!multi_name)
>   return -ENOMEM;
> 

Great, I'll move to that as well and validate.

Thanks,
-Beau

> -- Steve
> 
> > +
> > +   user->call.name = multi_name;
> > +   user->tracepoint.name = multi_name;
> > +
> > +   /* Inc to ensure unique multi-event name next time */
> > +   user->group->multi_id++;
> > +   } else {
> > +   /* Non Multi-format uses register name */
> > +   user->call.name = user->reg_name;
> > +   user->tracepoint.name = user->reg_name;
> > +   }
> > +
> > +   return 0;
> > +}



Re: [PATCH v3 2/4] tracing/user_events: Introduce multi-format events

2024-02-21 Thread Beau Belgrave
On Wed, Feb 21, 2024 at 10:21:04AM -0500, Steven Rostedt wrote:
> On Wed, 14 Feb 2024 17:50:44 +
> Beau Belgrave  wrote:
> 
> > Currently user_events supports 1 event with the same name and must have
> > the exact same format when referenced by multiple programs. This opens
> > an opportunity for malicous or poorly thought through programs to
> > create events that others use with different formats. Another scenario
> > is user programs wishing to use the same event name but add more fields
> > later when the software updates. Various versions of a program may be
> > running side-by-side, which is prevented by the current single format
> > requirement.
> > 
> > Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
> > the user program wishes to use the same user_event name, but may have
> > several different formats of the event in the future. When this flag is
> 
> "of the event in the future." Does it have to be in the future? Is there
> use case where an application might legitimately want the same event name
> with different formats?
> 

You're right, our use cases are mostly around future facing/compat.
There are valid cases where you just want several different formats with
the same name.

I'll drop the "in the future", so it'll just be "several different
formats".

Thanks,
-Beau

> -- Steve
> 
> > used, create the underlying tracepoint backing the user_event with a
> > unique name per-version of the format. It's important that existing ABI
> > users do not get this logic automatically, even if one of the multi
> > format events matches the format. This ensures existing programs that
> > create events and assume the tracepoint name will match exactly continue
> > to work as expected. Add logic to only check multi-format events with
> > other multi-format events and single-format events to only check
> > single-format events during find.
> > 



[PATCH v3 4/4] tracing/user_events: Document multi-format flag

2024-02-14 Thread Beau Belgrave
User programs can now ask user_events to handle the synchronization of
multiple different formats for an event with the same name via the new
USER_EVENT_REG_MULTI_FORMAT flag.

Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended
purpose and caveats of using it. Explain how deletion works in these
cases and how to use /sys/kernel/tracing/dynamic_events for per-version
deletion.

Signed-off-by: Beau Belgrave 
---
 Documentation/trace/user_events.rst | 27 ++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/Documentation/trace/user_events.rst 
b/Documentation/trace/user_events.rst
index d8f12442aaa6..1d5a7626e6a6 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -92,6 +92,24 @@ The following flags are currently supported.
   process closes or unregisters the event. Requires CAP_PERFMON otherwise
   -EPERM is returned.
 
++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This
+  allows programs to prevent themselves from being blocked when their event
+  format changes and they wish to use the same name. When this flag is used the
+  tracepoint name will be in the new format of "name.unique_id" vs the older
+  format of "name". A tracepoint will be created for each unique pair of name
+  and format. This means if several processes use the same name and format,
+  they will use the same tracepoint. If yet another process uses the same name,
+  but a different format than the other processes, it will use a different
+  tracepoint with a new unique id. Recording programs need to scan tracefs for
+  the various different formats of the event name they are interested in
+  recording. The system name of the tracepoint will also use 
"user_events_multi"
+  instead of "user_events". This prevents single-format event names conflicting
+  with any multi-format event names within tracefs. The unique_id is output as
+  a hex string. Recording programs should ensure the tracepoint name starts 
with
+  the event name they registered and has a suffix that starts with . and only
+  has hex characters. For example to find all versions of the event "test" you
+  can use the regex "^test\.[0-9a-fA-F]+$".
+
 Upon successful registration the following is set.
 
 + write_index: The index to use for this file descriptor that represents this
@@ -106,6 +124,9 @@ or perf record -e user_events:[name] when 
attaching/recording.
 **NOTE:** The event subsystem name by default is "user_events". Callers should
 not assume it will always be "user_events". Operators reserve the right in the
 future to change the subsystem name per-process to accommodate event isolation.
+In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name
+will have a unique id appended to it and the system name will be
+"user_events_multi" as described above.
 
 Command Format
 ^^
@@ -156,7 +177,11 @@ to request deletes than the one used for registration due 
to this.
 to the event. If programs do not want auto-delete, they must use the
 USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used
 the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an
-event that persists requires CAP_PERFMON, otherwise -EPERM is returned.
+event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When
+there are multiple formats of the same event name, all events with the same
+name will be attempted to be deleted. If only a specific version is wanted to
+be deleted then the /sys/kernel/tracing/dynamic_events file should be used for
+that specific format of the event.
 
 Unregistering
 -
-- 
2.34.1




[PATCH v3 1/4] tracing/user_events: Prepare find/delete for same name events

2024-02-14 Thread Beau Belgrave
The current code for finding and deleting events assumes that there will
never be cases when user_events are registered with the same name, but
different formats. In the future this scenario will exist to ensure
user programs can be updated or modify their events and run different
versions of their programs side-by-side without being blocked.

This change does not yet allow for multi-format events. If user_events
are registered with the same name but different arguments the programs
see the same return values as before. This change simply makes it
possible to easily accomodate for this in future changes.

Update find_user_event() to take in argument parameters and register
flags to accomodate future multi-format event scenarios. Have find
validate argument matching and return error pointers to cover address
in use cases, or allocation errors. Update callers to handle error
pointer logic.

Move delete_user_event() to use hash walking directly now that find has
changed. Delete all events found that match the register name, stop
if an error occurs and report back to the user.

Update user_fields_match() to cover list_empty() scenarios instead of
each callsite doing it now that find_user_event() uses it directly.

Signed-off-by: Beau Belgrave 
---
 kernel/trace/trace_events_user.c | 107 +--
 1 file changed, 59 insertions(+), 48 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..dda58681247e 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct 
user_event_mm *mm);
 static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
 static void user_event_mm_put(struct user_event_mm *mm);
 static int destroy_user_event(struct user_event *user);
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv);
 
 static u32 user_event_key(char *name)
 {
@@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user)
 }
 
 static struct user_event *find_user_event(struct user_event_group *group,
- char *name, u32 *outkey)
+ char *name, int argc, const char 
**argv,
+ u32 flags, u32 *outkey)
 {
struct user_event *user;
u32 key = user_event_key(name);
 
*outkey = key;
 
-   hash_for_each_possible(group->register_table, user, node, key)
-   if (!strcmp(EVENT_NAME(user), name))
+   hash_for_each_possible(group->register_table, user, node, key) {
+   if (strcmp(EVENT_NAME(user), name))
+   continue;
+
+   if (user_fields_match(user, argc, argv))
return user_event_get(user);
 
+   return ERR_PTR(-EADDRINUSE);
+   }
+
return NULL;
 }
 
@@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, 
int argc,
struct list_head *head = >fields;
int i = 0;
 
+   if (argc == 0)
+   return list_empty(head);
+
list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, ))
return false;
@@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const 
char *event,
match = strcmp(EVENT_NAME(user), event) == 0 &&
(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
 
-   if (match && argc > 0)
+   if (match)
match = user_fields_match(user, argc, argv);
-   else if (match && argc == 0)
-   match = list_empty(>fields);
 
return match;
 }
@@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
char *args, char *flags,
struct user_event **newuser, int reg_flags)
 {
-   int ret;
-   u32 key;
struct user_event *user;
+   char **argv = NULL;
int argc = 0;
-   char **argv;
+   int ret;
+   u32 key;
 
/* Currently don't support any text based flags */
if (flags != NULL)
@@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
if (!user_event_capable(reg_flags))
return -EPERM;
 
+   if (args) {
+   argv = argv_split(GFP_KERNEL, args, );
+
+   if (!argv)
+   return -ENOMEM;
+   }
+
/* Prevent dyn_event from racing */
mutex_lock(_mutex);
-   user = find_user_event(group, name, );
+   user = find_user_event(group, name, argc, (const char **)argv,
+  reg_flags, );
mutex_unlock(_mutex);
 
-   if (user) {
-   if (args) {
-  

[PATCH v3 2/4] tracing/user_events: Introduce multi-format events

2024-02-14 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event in the future. When this flag is
used, create the underlying tracepoint backing the user_event with a
unique name per-version of the format. It's important that existing ABI
users do not get this logic automatically, even if one of the multi
format events matches the format. This ensures existing programs that
create events and assume the tracepoint name will match exactly continue
to work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.
This prevents single-format names from conflicting with multi-format
events if they end with the same suffix as the multi-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}.{unique_id}.

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test.0
  test.1

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Signed-off-by: Beau Belgrave 
---
 include/uapi/linux/user_events.h |   6 +-
 kernel/trace/trace_events_user.c | 118 +++
 2 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index f74f3aedd49c..a03de03dccbc 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -12,6 +12,7 @@
 #include 
 
 #define USER_EVENTS_SYSTEM "user_events"
+#define USER_EVENTS_MULTI_SYSTEM "user_events_multi"
 #define USER_EVENTS_PREFIX "u:"
 
 /* Create dynamic location entry within a 32-bit value */
@@ -22,8 +23,11 @@ enum user_reg_flag {
/* Event will not delete upon last reference closing */
USER_EVENT_REG_PERSIST  = 1U << 0,
 
+   /* Event will be allowed to have multiple formats */
+   USER_EVENT_REG_MULTI_FORMAT = 1U << 1,
+
/* This value or above is currently non-ABI */
-   USER_EVENT_REG_MAX  = 1U << 1,
+   USER_EVENT_REG_MAX  = 1U << 2,
 };
 
 /*
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index dda58681247e..53980982a575 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -34,7 +34,8 @@
 
 /* Limit how long of an event name plus args within the subsystem. */
 #define MAX_EVENT_DESC 512
-#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define EVENT_NAME(user_event) ((user_event)->reg_name)
+#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
 
 /*
@@ -54,10 +55,13 @@
  * allows isolation for events by various means.
  */
 struct user_event_group {
-   char*system_name;
-   struct  hlist_node node;
-   struct  mutex reg_mutex;
+   char*system_name;
+   char*system_multi_name;
+   struct hlist_node   node;
+   struct mutexreg_mutex;
DECLARE_HASHTABLE(register_table, 8);
+   /* ID that mov

[PATCH v3 3/4] selftests/user_events: Test multi-format events

2024-02-14 Thread Beau Belgrave
User_events now has multi-format events which allow for the same
register name, but with different formats. When this occurs, different
tracepoints are created with unique names.

Add a new test that ensures the same name can be used for two different
formats. Ensure they are isolated from each other and that name and arg
matching still works if yet another register comes in with the same
format as one of the two.

Signed-off-by: Beau Belgrave 
---
 .../testing/selftests/user_events/abi_test.c  | 134 ++
 1 file changed, 134 insertions(+)

diff --git a/tools/testing/selftests/user_events/abi_test.c 
b/tools/testing/selftests/user_events/abi_test.c
index cef1ff1af223..7288a05136ba 100644
--- a/tools/testing/selftests/user_events/abi_test.c
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "../kselftest_harness.h"
@@ -23,6 +25,62 @@
 
 const char *data_file = "/sys/kernel/tracing/user_events_data";
 const char *enable_file = 
"/sys/kernel/tracing/events/user_events/__abi_event/enable";
+const char *multi_dir_glob = 
"/sys/kernel/tracing/events/user_events_multi/__abi_event.*";
+
+static int wait_for_delete(char *dir)
+{
+   struct stat buf;
+   int i;
+
+   for (i = 0; i < 1; ++i) {
+   if (stat(dir, ) == -1 && errno == ENOENT)
+   return 0;
+
+   usleep(1000);
+   }
+
+   return -1;
+}
+
+static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len)
+{
+   char path[256];
+   glob_t buf;
+   int i, ret;
+
+   ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, );
+
+   if (ret)
+   return -1;
+
+   ret = -1;
+
+   for (i = 0; i < buf.gl_pathc; ++i) {
+   FILE *fp;
+
+   snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]);
+   fp = fopen(path, "r");
+
+   if (!fp)
+   continue;
+
+   while (fgets(path, sizeof(path), fp) != NULL) {
+   if (strstr(path, unique_field)) {
+   fclose(fp);
+   /* strscpy is not available, use snprintf */
+   snprintf(out_dir, dir_len, "%s", 
buf.gl_pathv[i]);
+   ret = 0;
+   goto out;
+   }
+   }
+
+   fclose(fp);
+   }
+out:
+   globfree();
+
+   return ret;
+}
 
 static bool event_exists(void)
 {
@@ -74,6 +132,39 @@ static int event_delete(void)
return ret;
 }
 
+static int reg_enable_multi(void *enable, int size, int bit, int flags,
+   char *args)
+{
+   struct user_reg reg = {0};
+   char full_args[512] = {0};
+   int fd = open(data_file, O_RDWR);
+   int len;
+   int ret;
+
+   if (fd < 0)
+   return -1;
+
+   len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args);
+
+   if (len > sizeof(full_args)) {
+   ret = -E2BIG;
+   goto out;
+   }
+
+   reg.size = sizeof(reg);
+   reg.name_args = (__u64)full_args;
+   reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags;
+   reg.enable_bit = bit;
+   reg.enable_addr = (__u64)enable;
+   reg.enable_size = size;
+
+   ret = ioctl(fd, DIAG_IOCSREG, );
+out:
+   close(fd);
+
+   return ret;
+}
+
 static int reg_enable_flags(void *enable, int size, int bit, int flags)
 {
struct user_reg reg = {0};
@@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) {
ASSERT_NE(0, reg_enable(>check, 128, 0));
 }
 
+TEST_F(user, multi_format) {
+   char first_dir[256];
+   char second_dir[256];
+   struct stat buf;
+
+   /* Multiple formats for the same name should work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0,
+ 0, "u32 multi_first"));
+
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1,
+ 0, "u64 multi_second"));
+
+   /* Same name with same format should also work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2,
+ 0, "u64 multi_second"));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_first",
+ first_dir, sizeof(first_dir)));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_second",
+ second_dir, sizeof(second_dir)));
+
+   /* Should not be found in the same dir */
+   ASSERT_NE(0, strcmp(first_dir, second_dir));
+
+   /* First dir should still exist */
+   ASSERT_EQ(0, stat(first_dir, ));
+
+   /* Disabling first register sho

[PATCH v3 0/4] tracing/user_events: Introduce multi-format events

2024-02-14 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event in the future. When this flag is
used, create the underlying tracepoint backing the user_event with a
unique name per-version of the format. It's important that existing ABI
users do not get this logic automatically, even if one of the multi
format events matches the format. This ensures existing programs that
create events and assume the tracepoint name will match exactly continue
to work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.
This prevents single-format names from conflicting with multi-format
events if they end with the same suffix as the multi-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}.{unique_id}.

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test.0
  test.1

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Changes in v3:
  Use hash_for_each_possible_safe() in destroy_user_event() to prevent
  use after free (caught by kernel test robot ).

Changes in v2:
  Tracepoint names changed from "name:[id]" to "name.id". Feedback
  was the : could conflict with system name formats. []'s are also
  special characters for bash.

  Updated self-test and docs to reflect the new suffix format.

  Updated docs to include a regex example to help guide recording
  programs find the correct event in ambiguous cases.

Beau Belgrave (4):
  tracing/user_events: Prepare find/delete for same name events
  tracing/user_events: Introduce multi-format events
  selftests/user_events: Test multi-format events
  tracing/user_events: Document multi-format flag

 Documentation/trace/user_events.rst   |  27 ++-
 include/uapi/linux/user_events.h  |   6 +-
 kernel/trace/trace_events_user.c  | 225 +-
 .../testing/selftests/user_events/abi_test.c  | 134 +++
 4 files changed, 330 insertions(+), 62 deletions(-)


base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a
-- 
2.34.1




[PATCH v2 3/4] selftests/user_events: Test multi-format events

2024-02-02 Thread Beau Belgrave
User_events now has multi-format events which allow for the same
register name, but with different formats. When this occurs, different
tracepoints are created with unique names.

Add a new test that ensures the same name can be used for two different
formats. Ensure they are isolated from each other and that name and arg
matching still works if yet another register comes in with the same
format as one of the two.

Signed-off-by: Beau Belgrave 
---
 .../testing/selftests/user_events/abi_test.c  | 134 ++
 1 file changed, 134 insertions(+)

diff --git a/tools/testing/selftests/user_events/abi_test.c 
b/tools/testing/selftests/user_events/abi_test.c
index cef1ff1af223..7288a05136ba 100644
--- a/tools/testing/selftests/user_events/abi_test.c
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "../kselftest_harness.h"
@@ -23,6 +25,62 @@
 
 const char *data_file = "/sys/kernel/tracing/user_events_data";
 const char *enable_file = 
"/sys/kernel/tracing/events/user_events/__abi_event/enable";
+const char *multi_dir_glob = 
"/sys/kernel/tracing/events/user_events_multi/__abi_event.*";
+
+static int wait_for_delete(char *dir)
+{
+   struct stat buf;
+   int i;
+
+   for (i = 0; i < 1; ++i) {
+   if (stat(dir, ) == -1 && errno == ENOENT)
+   return 0;
+
+   usleep(1000);
+   }
+
+   return -1;
+}
+
+static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len)
+{
+   char path[256];
+   glob_t buf;
+   int i, ret;
+
+   ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, );
+
+   if (ret)
+   return -1;
+
+   ret = -1;
+
+   for (i = 0; i < buf.gl_pathc; ++i) {
+   FILE *fp;
+
+   snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]);
+   fp = fopen(path, "r");
+
+   if (!fp)
+   continue;
+
+   while (fgets(path, sizeof(path), fp) != NULL) {
+   if (strstr(path, unique_field)) {
+   fclose(fp);
+   /* strscpy is not available, use snprintf */
+   snprintf(out_dir, dir_len, "%s", 
buf.gl_pathv[i]);
+   ret = 0;
+   goto out;
+   }
+   }
+
+   fclose(fp);
+   }
+out:
+   globfree();
+
+   return ret;
+}
 
 static bool event_exists(void)
 {
@@ -74,6 +132,39 @@ static int event_delete(void)
return ret;
 }
 
+static int reg_enable_multi(void *enable, int size, int bit, int flags,
+   char *args)
+{
+   struct user_reg reg = {0};
+   char full_args[512] = {0};
+   int fd = open(data_file, O_RDWR);
+   int len;
+   int ret;
+
+   if (fd < 0)
+   return -1;
+
+   len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args);
+
+   if (len > sizeof(full_args)) {
+   ret = -E2BIG;
+   goto out;
+   }
+
+   reg.size = sizeof(reg);
+   reg.name_args = (__u64)full_args;
+   reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags;
+   reg.enable_bit = bit;
+   reg.enable_addr = (__u64)enable;
+   reg.enable_size = size;
+
+   ret = ioctl(fd, DIAG_IOCSREG, );
+out:
+   close(fd);
+
+   return ret;
+}
+
 static int reg_enable_flags(void *enable, int size, int bit, int flags)
 {
struct user_reg reg = {0};
@@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) {
ASSERT_NE(0, reg_enable(>check, 128, 0));
 }
 
+TEST_F(user, multi_format) {
+   char first_dir[256];
+   char second_dir[256];
+   struct stat buf;
+
+   /* Multiple formats for the same name should work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0,
+ 0, "u32 multi_first"));
+
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1,
+ 0, "u64 multi_second"));
+
+   /* Same name with same format should also work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2,
+ 0, "u64 multi_second"));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_first",
+ first_dir, sizeof(first_dir)));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_second",
+ second_dir, sizeof(second_dir)));
+
+   /* Should not be found in the same dir */
+   ASSERT_NE(0, strcmp(first_dir, second_dir));
+
+   /* First dir should still exist */
+   ASSERT_EQ(0, stat(first_dir, ));
+
+   /* Disabling first register sho

[PATCH v2 2/4] tracing/user_events: Introduce multi-format events

2024-02-02 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event in the future. When this flag is
used, create the underlying tracepoint backing the user_event with a
unique name per-version of the format. It's important that existing ABI
users do not get this logic automatically, even if one of the multi
format events matches the format. This ensures existing programs that
create events and assume the tracepoint name will match exactly continue
to work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.
This prevents single-format names from conflicting with multi-format
events if they end with the same suffix as the multi-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}.{unique_id}.

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test.0
  test.1

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Signed-off-by: Beau Belgrave 
---
 include/uapi/linux/user_events.h |   6 +-
 kernel/trace/trace_events_user.c | 118 +++
 2 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index f74f3aedd49c..a03de03dccbc 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -12,6 +12,7 @@
 #include 
 
 #define USER_EVENTS_SYSTEM "user_events"
+#define USER_EVENTS_MULTI_SYSTEM "user_events_multi"
 #define USER_EVENTS_PREFIX "u:"
 
 /* Create dynamic location entry within a 32-bit value */
@@ -22,8 +23,11 @@ enum user_reg_flag {
/* Event will not delete upon last reference closing */
USER_EVENT_REG_PERSIST  = 1U << 0,
 
+   /* Event will be allowed to have multiple formats */
+   USER_EVENT_REG_MULTI_FORMAT = 1U << 1,
+
/* This value or above is currently non-ABI */
-   USER_EVENT_REG_MAX  = 1U << 1,
+   USER_EVENT_REG_MAX  = 1U << 2,
 };
 
 /*
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 0480579ba563..608750e4e6df 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -34,7 +34,8 @@
 
 /* Limit how long of an event name plus args within the subsystem. */
 #define MAX_EVENT_DESC 512
-#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define EVENT_NAME(user_event) ((user_event)->reg_name)
+#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
 
 /*
@@ -54,10 +55,13 @@
  * allows isolation for events by various means.
  */
 struct user_event_group {
-   char*system_name;
-   struct  hlist_node node;
-   struct  mutex reg_mutex;
+   char*system_name;
+   char*system_multi_name;
+   struct hlist_node   node;
+   struct mutexreg_mutex;
DECLARE_HASHTABLE(register_table, 8);
+   /* ID that mov

[PATCH v2 1/4] tracing/user_events: Prepare find/delete for same name events

2024-02-02 Thread Beau Belgrave
The current code for finding and deleting events assumes that there will
never be cases when user_events are registered with the same name, but
different formats. In the future this scenario will exist to ensure
user programs can be updated or modify their events and run different
versions of their programs side-by-side without being blocked.

This change does not yet allow for multi-format events. If user_events
are registered with the same name but different arguments the programs
see the same return values as before. This change simply makes it
possible to easily accomodate for this in future changes.

Update find_user_event() to take in argument parameters and register
flags to accomodate future multi-format event scenarios. Have find
validate argument matching and return error pointers to cover address
in use cases, or allocation errors. Update callers to handle error
pointer logic.

Move delete_user_event() to use hash walking directly now that find has
changed. Delete all events found that match the register name, stop
if an error occurs and report back to the user.

Update user_fields_match() to cover list_empty() scenarios instead of
each callsite doing it now that find_user_event() uses it directly.

Signed-off-by: Beau Belgrave 
---
 kernel/trace/trace_events_user.c | 106 +--
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..0480579ba563 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct 
user_event_mm *mm);
 static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
 static void user_event_mm_put(struct user_event_mm *mm);
 static int destroy_user_event(struct user_event *user);
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv);
 
 static u32 user_event_key(char *name)
 {
@@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user)
 }
 
 static struct user_event *find_user_event(struct user_event_group *group,
- char *name, u32 *outkey)
+ char *name, int argc, const char 
**argv,
+ u32 flags, u32 *outkey)
 {
struct user_event *user;
u32 key = user_event_key(name);
 
*outkey = key;
 
-   hash_for_each_possible(group->register_table, user, node, key)
-   if (!strcmp(EVENT_NAME(user), name))
+   hash_for_each_possible(group->register_table, user, node, key) {
+   if (strcmp(EVENT_NAME(user), name))
+   continue;
+
+   if (user_fields_match(user, argc, argv))
return user_event_get(user);
 
+   return ERR_PTR(-EADDRINUSE);
+   }
+
return NULL;
 }
 
@@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, 
int argc,
struct list_head *head = >fields;
int i = 0;
 
+   if (argc == 0)
+   return list_empty(head);
+
list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, ))
return false;
@@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const 
char *event,
match = strcmp(EVENT_NAME(user), event) == 0 &&
(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
 
-   if (match && argc > 0)
+   if (match)
match = user_fields_match(user, argc, argv);
-   else if (match && argc == 0)
-   match = list_empty(>fields);
 
return match;
 }
@@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
char *args, char *flags,
struct user_event **newuser, int reg_flags)
 {
-   int ret;
-   u32 key;
struct user_event *user;
+   char **argv = NULL;
int argc = 0;
-   char **argv;
+   int ret;
+   u32 key;
 
/* Currently don't support any text based flags */
if (flags != NULL)
@@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
if (!user_event_capable(reg_flags))
return -EPERM;
 
+   if (args) {
+   argv = argv_split(GFP_KERNEL, args, );
+
+   if (!argv)
+   return -ENOMEM;
+   }
+
/* Prevent dyn_event from racing */
mutex_lock(_mutex);
-   user = find_user_event(group, name, );
+   user = find_user_event(group, name, argc, (const char **)argv,
+  reg_flags, );
mutex_unlock(_mutex);
 
-   if (user) {
-   if (args) {
-  

[PATCH v2 4/4] tracing/user_events: Document multi-format flag

2024-02-02 Thread Beau Belgrave
User programs can now ask user_events to handle the synchronization of
multiple different formats for an event with the same name via the new
USER_EVENT_REG_MULTI_FORMAT flag.

Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended
purpose and caveats of using it. Explain how deletion works in these
cases and how to use /sys/kernel/tracing/dynamic_events for per-version
deletion.

Signed-off-by: Beau Belgrave 
---
 Documentation/trace/user_events.rst | 27 ++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/Documentation/trace/user_events.rst 
b/Documentation/trace/user_events.rst
index d8f12442aaa6..1d5a7626e6a6 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -92,6 +92,24 @@ The following flags are currently supported.
   process closes or unregisters the event. Requires CAP_PERFMON otherwise
   -EPERM is returned.
 
++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This
+  allows programs to prevent themselves from being blocked when their event
+  format changes and they wish to use the same name. When this flag is used the
+  tracepoint name will be in the new format of "name.unique_id" vs the older
+  format of "name". A tracepoint will be created for each unique pair of name
+  and format. This means if several processes use the same name and format,
+  they will use the same tracepoint. If yet another process uses the same name,
+  but a different format than the other processes, it will use a different
+  tracepoint with a new unique id. Recording programs need to scan tracefs for
+  the various different formats of the event name they are interested in
+  recording. The system name of the tracepoint will also use 
"user_events_multi"
+  instead of "user_events". This prevents single-format event names conflicting
+  with any multi-format event names within tracefs. The unique_id is output as
+  a hex string. Recording programs should ensure the tracepoint name starts 
with
+  the event name they registered and has a suffix that starts with . and only
+  has hex characters. For example to find all versions of the event "test" you
+  can use the regex "^test\.[0-9a-fA-F]+$".
+
 Upon successful registration the following is set.
 
 + write_index: The index to use for this file descriptor that represents this
@@ -106,6 +124,9 @@ or perf record -e user_events:[name] when 
attaching/recording.
 **NOTE:** The event subsystem name by default is "user_events". Callers should
 not assume it will always be "user_events". Operators reserve the right in the
 future to change the subsystem name per-process to accommodate event isolation.
+In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name
+will have a unique id appended to it and the system name will be
+"user_events_multi" as described above.
 
 Command Format
 ^^
@@ -156,7 +177,11 @@ to request deletes than the one used for registration due 
to this.
 to the event. If programs do not want auto-delete, they must use the
 USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used
 the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an
-event that persists requires CAP_PERFMON, otherwise -EPERM is returned.
+event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When
+there are multiple formats of the same event name, all events with the same
+name will be attempted to be deleted. If only a specific version is wanted to
+be deleted then the /sys/kernel/tracing/dynamic_events file should be used for
+that specific format of the event.
 
 Unregistering
 -
-- 
2.34.1




[PATCH v2 0/4] tracing/user_events: Introduce multi-format events

2024-02-02 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event in the future. When this flag is
used, create the underlying tracepoint backing the user_event with a
unique name per-version of the format. It's important that existing ABI
users do not get this logic automatically, even if one of the multi
format events matches the format. This ensures existing programs that
create events and assume the tracepoint name will match exactly continue
to work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.
This prevents single-format names from conflicting with multi-format
events if they end with the same suffix as the multi-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}.{unique_id}.

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test.0
  test.1

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Changes in v2:
  Tracepoint names changed from "name:[id]" to "name.id". Feedback
  was the : could conflict with system name formats. []'s are also
  special characters for bash.

  Updated self-test and docs to reflect the new suffix format.

  Updated docs to include a regex example to help guide recording
  programs find the correct event in ambiguous cases.

Beau Belgrave (4):
  tracing/user_events: Prepare find/delete for same name events
  tracing/user_events: Introduce multi-format events
  selftests/user_events: Test multi-format events
  tracing/user_events: Document multi-format flag

 Documentation/trace/user_events.rst   |  27 ++-
 include/uapi/linux/user_events.h  |   6 +-
 kernel/trace/trace_events_user.c  | 224 +-
 .../testing/selftests/user_events/abi_test.c  | 134 +++
 4 files changed, 329 insertions(+), 62 deletions(-)


base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a
-- 
2.34.1




Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events

2024-01-30 Thread Beau Belgrave
On Tue, Jan 30, 2024 at 01:52:30PM -0500, Steven Rostedt wrote:
> On Tue, 30 Jan 2024 10:05:15 -0800
> Beau Belgrave  wrote:
> 
> > On Mon, Jan 29, 2024 at 09:24:07PM -0500, Steven Rostedt wrote:
> > > On Mon, 29 Jan 2024 09:29:07 -0800
> > > Beau Belgrave  wrote:
> > >   
> > > > Thanks, yeah ideally we wouldn't use special characters.
> > > > 
> > > > I'm not picky about this. However, I did want something that clearly
> > > > allowed a glob pattern to find all versions of a given register name of
> > > > user_events by user programs that record. The dot notation will pull in
> > > > more than expected if dotted namespace style names are used.
> > > > 
> > > > An example is "Asserts" and "Asserts.Verbose" from different programs.
> > > > If we tried to find all versions of "Asserts" via glob of "Asserts.*" it
> > > > will pull in "Asserts.Verbose.1" in addition to "Asserts.0".  
> > > 
> > > Do you prevent brackets in names?
> > >   
> > 
> > No. However, since brackets have a start and end token that are distinct
> > finding all versions of your event is trivial compared to a single dot.
> > 
> > Imagine two events:
> > Asserts
> > Asserts[MyCoolIndex]
> > 
> > Resolves to tracepoints of:
> > Asserts:[0]
> > Asserts[MyCoolIndex]:[1]
> > 
> > Regardless of brackets in the names, a simple glob of Asserts:\[*\] only
> > finds Asserts:[0]. This is because we have that end bracket in the glob
> > and the full event name including the start bracket.
> > 
> > If I register another "version" of Asserts, thne I'll have:
> > Asserts:[0]
> > Asserts[MyCoolIndex]:[1]
> > Asserts:[2]
> > 
> > The glob of Asserts:\[*\] will return both:
> > Asserts:[0]
> > Asserts:[2]
> 
> But what if you had registered "Asserts:[MyCoolIndex]:[1]"
> 

Good point, the above would still require a regex type pattern to not
get pulled in.

> Do you prevent colons?
> 

No, nothing is prevented at this point.

It seems we could either prevent certain characters to make it easier or
define a good regex that we should document.

I'm leaning toward just doing a simple suffix and documenting the regex
well.

> > 
> > At this point the program can either record all versions or scan further
> > to find which version of Asserts is wanted.
> > 
> > > > 
> > > > While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it
> > > > doesn't work if the number is higher, like 128. If we ever decide to
> > > > change the ID from an integer to say hex to save space, these globs
> > > > would break.
> > > > 
> > > > Is there some scheme that fits the C-variable name that addresses the
> > > > above scenarios? Brackets gave me a simple glob that seemed to prevent a
> > > > lot of this ("Asserts.\[*\]" in this case).  
> > > 
> > > Prevent a lot of what? I'm not sure what your example here is.
> > >   
> > 
> > I'll try again :)
> > 
> > We have 2 events registered via user_events:
> > Asserts
> > Asserts.Verbose
> > 
> > Using dot notation these would result in tracepoints of:
> > user_events_multi/Asserts.0
> > user_events_multi/Asserts.Verbose.1
> > 
> > Using bracket notation these would result in tracepoints of:
> > user_events_multi/Asserts:[0]
> > user_events_multi/Asserts.Verbose:[1]
> > 
> > A recording program only wants to enable the Asserts tracepoint. It does
> > not want to record the Asserts.Verbose tracepoint.
> > 
> > The program must find the right tracepoint by scanning tracefs under the
> > user_events_multi system.
> > 
> > A single dot suffix does not allow a simple glob to be used. The glob
> > Asserts.* will return both Asserts.0 and Asserts.Verbose.1.
> > 
> > A simple glob of Asserts:\[*\] will only find Asserts:[0], it will not
> > find Asserts.Verbose:[1].
> > 
> > We could just use brackets and not have the colon (Asserts[0] in this
> > case). But brackets are still special for bash.
> 
> Are these shell scripts or programs. I use regex in programs all the time.
> And if you have shell scripts, use awk or something.
> 

They could be both. In our case, it is a program.

> Unless you prevent something from being added, I don't see the protection.
> 

Yeah, it just makes it way less likely. Given that, I'm starting to lean
toward just docu

Re: [PATCH 0/4] tracing/user_events: Introduce multi-format events

2024-01-30 Thread Beau Belgrave
On Tue, Jan 30, 2024 at 11:09:33AM +0900, Masami Hiramatsu wrote:
> Hi Beau,
> 
> On Tue, 23 Jan 2024 22:08:40 +0000
> Beau Belgrave  wrote:
> 
> > Currently user_events supports 1 event with the same name and must have
> > the exact same format when referenced by multiple programs. This opens
> > an opportunity for malicous or poorly thought through programs to
> > create events that others use with different formats. Another scenario
> > is user programs wishing to use the same event name but add more fields
> > later when the software updates. Various versions of a program may be
> > running side-by-side, which is prevented by the current single format
> > requirement.
> > 
> > Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
> > the user program wishes to use the same user_event name, but may have
> > several different formats of the event in the future. When this flag is
> > used, create the underlying tracepoint backing the user_event with a
> > unique name per-version of the format. It's important that existing ABI
> > users do not get this logic automatically, even if one of the multi
> > format events matches the format. This ensures existing programs that
> > create events and assume the tracepoint name will match exactly continue
> > to work as expected. Add logic to only check multi-format events with
> > other multi-format events and single-format events to only check
> > single-format events during find.
> 
> Thanks for this work! This will allow many instance to use the same
> user-events at the same time.
> 
> BTW, can we force this flag set by default? My concern is if any user
> program use this user-event interface in the container (maybe it is
> possible if we bind-mount it). In this case, the user program can
> detect the other program is using the event if this flag is not set.
> Moreover, if there is a malicious program running in the container,
> it can prevent using the event name from other programs even if it
> is isolated by the name-space.
> 

The multi-format use a different system name (user_events_multi). So you
cannot use the single-format flag to detect multi-format names, etc. You
can only use it to find other single-format names like you could always do.

Likewise, you cannot use the single-event flag to block or prevent
multi-format events from being created.

Changing this behavior to default would break all of our environments.
So I'm pretty sure it would break others. The current environment
expects tracepoints to show up as their registered name under the
user_events system name. If this changed out from under us on a specific
kernel version, that would be bad.

I'd like eventually to have a tracer namespace concept for containers.
Then we would have a user_event_group per tracer namespace. Then all
user_events within the container have a unique system name which fully
isolates them. However, even with that isolation, we still need a way to
allow programs in the same container to have different versions of the
same event name.

Multi-format events fixes this problem. I think isolation should be
dealt with via true namespace isolation at the tracing level.

> Steve suggested that if a user program which is running in a namespace
> uses user-event without this flag, we can reject that by default.
> 
> What would you think about?
> 

This would break all of our environments. It would make previously
compiled programs using the existing ABI from working as expected.

I'd much prefer that level of isolation to happen at the namespace level
and why user_events as plumbing for different groups to achieve this.
It's also why the ABI does not allow programs to state a system name.
I'm trying to reserve the system name for the group/tracer/namespace
isolation work.

Thanks,
-Beau

> Thank you,
> 
> 
> > 
> > Add a register_name (reg_name) to the user_event struct which allows for
> > split naming of events. We now have the name that was used to register
> > within user_events as well as the unique name for the tracepoint. Upon
> > registering events ensure matches based on first the reg_name, followed
> > by the fields and format of the event. This allows for multiple events
> > with the same registered name to have different formats. The underlying
> > tracepoint will have a unique name in the format of {reg_name}:[unique_id].
> > The unique_id is the time, in nanoseconds, of the event creation converted
> > to hex. Since this is done under the register mutex, it is extremely
> > unlikely for these IDs to ever match. It's also very unlikely a malicious
> > program could consistently guess what the name would be and attempt to
> > squat on it via the single format ABI.
> > 
&g

Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events

2024-01-30 Thread Beau Belgrave
On Tue, Jan 30, 2024 at 11:12:22PM +0900, Masami Hiramatsu wrote:
> On Mon, 29 Jan 2024 09:29:07 -0800
> Beau Belgrave  wrote:
> 
> > On Fri, Jan 26, 2024 at 03:04:45PM -0500, Steven Rostedt wrote:
> > > On Fri, 26 Jan 2024 11:10:07 -0800
> > > Beau Belgrave  wrote:
> > > 
> > > > > OK, so the each different event has suffixed name. But this will
> > > > > introduce non C-variable name.
> > > > > 
> > > > > Steve, do you think your library can handle these symbols? It will
> > > > > be something like "event:[1]" as the event name.
> > > > > Personally I like "event.1" style. (of course we need to ensure the
> > > > > user given event name is NOT including such suffix numbers)
> > > > >   
> > > > 
> > > > Just to clarify around events including a suffix number. This is why
> > > > multi-events use "user_events_multi" system name and the single-events
> > > > using just "user_events".
> > > > 
> > > > Even if a user program did include a suffix, the suffix would still get
> > > > appended. An example is "test" vs "test:[0]" using multi-format would
> > > > result in two tracepoints ("test:[0]" and "test:[0]:[1]" respectively
> > > > (assuming these are the first multi-events on the system).
> > > > 
> > > > I'm with you, we really don't want any spoofing or squatting possible.
> > > > By using different system names and always appending the suffix I
> > > > believe covers this.
> > > > 
> > > > Looking forward to hearing Steven's thoughts on this as well.
> > > 
> > > I'm leaning towards Masami's suggestion to use dots, as that won't 
> > > conflict
> > > with special characters from bash, as '[' and ']' do.
> > > 
> > 
> > Thanks, yeah ideally we wouldn't use special characters.
> > 
> > I'm not picky about this. However, I did want something that clearly
> > allowed a glob pattern to find all versions of a given register name of
> > user_events by user programs that record. The dot notation will pull in
> > more than expected if dotted namespace style names are used.
> > 
> > An example is "Asserts" and "Asserts.Verbose" from different programs.
> > If we tried to find all versions of "Asserts" via glob of "Asserts.*" it
> > will pull in "Asserts.Verbose.1" in addition to "Asserts.0".
> 
> If we use dot for the suffix number, we can prohibit user to use it
> for their name. They still can use '_' (or change the group name?)

We could, however, we have user_event integration in OpenTelemetry and
I'm unsure if we should really try to restrict names. We'll also at some
point have libside integration, which might not have the same
restrictions on the user-tracer side as the kernel-tracer side.

I'm trying to restrict the user_event group name from changing outside
of an eventual tracer namespace. I'd like for each container to inherit
a tracer namespace long-term which decides what the actual group name
will be instead of users self-selecting names to prevent squatting or
spoofing of events.

> I just concerned that the name can be parsed by existing tools. Since
> ':' is used as a separator for group and event name in some case (e.g.
> tracefs "set_event" is using, so trace-cmd and perf is using it.)
> 

Good point.

What about just event_name[unique_id]? IE: Drop the colon.

Brackets are still special in bash, but it would prevent simple glob
patterns from matching to incorrect tracepoints under user_events_multi.

> > While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it
> > doesn't work if the number is higher, like 128. If we ever decide to
> > change the ID from an integer to say hex to save space, these globs
> > would break.
> 
> Hmm, why can't we use regexp?

We can use regex, but we'll need to agree the suffix format. We won't be
able to change it after that point. I'd prefer a base-16/Hex suffix
either in brackets or a simple dot.

> And if we limits the number of events up to 1000 for each same-name event
> we can use fixed numbers, like Assets.[0-9][0-9][0-9]
> 

I'm always wrong when I guess how many events programs will end up
using. Folks always surprise me. I'd rather have a solution that scales
to the max number of tracepoints allowed on the system (currently 16-bit
max value).

Thanks,
-Beau

> Thank you,
> 
> > 
> > Is there some scheme that fits the C-variable name that addresses the
> > above scenarios? Brackets gave me a simple glob that seemed to prevent a
> > lot of this ("Asserts.\[*\]" in this case).
> > 
> > Are we confident that we always want to represent the ID as a base-10
> > integer vs a base-16 integer? The suffix will be ABI to ensure recording
> > programs can find their events easily.
> > 
> > Thanks,
> > -Beau
> > 
> > > -- Steve
> 
> 
> -- 
> Masami Hiramatsu (Google) 



Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events

2024-01-30 Thread Beau Belgrave
On Mon, Jan 29, 2024 at 09:24:07PM -0500, Steven Rostedt wrote:
> On Mon, 29 Jan 2024 09:29:07 -0800
> Beau Belgrave  wrote:
> 
> > Thanks, yeah ideally we wouldn't use special characters.
> > 
> > I'm not picky about this. However, I did want something that clearly
> > allowed a glob pattern to find all versions of a given register name of
> > user_events by user programs that record. The dot notation will pull in
> > more than expected if dotted namespace style names are used.
> > 
> > An example is "Asserts" and "Asserts.Verbose" from different programs.
> > If we tried to find all versions of "Asserts" via glob of "Asserts.*" it
> > will pull in "Asserts.Verbose.1" in addition to "Asserts.0".
> 
> Do you prevent brackets in names?
> 

No. However, since brackets have a start and end token that are distinct
finding all versions of your event is trivial compared to a single dot.

Imagine two events:
Asserts
Asserts[MyCoolIndex]

Resolves to tracepoints of:
Asserts:[0]
Asserts[MyCoolIndex]:[1]

Regardless of brackets in the names, a simple glob of Asserts:\[*\] only
finds Asserts:[0]. This is because we have that end bracket in the glob
and the full event name including the start bracket.

If I register another "version" of Asserts, thne I'll have:
Asserts:[0]
Asserts[MyCoolIndex]:[1]
Asserts:[2]

The glob of Asserts:\[*\] will return both:
Asserts:[0]
Asserts:[2]

At this point the program can either record all versions or scan further
to find which version of Asserts is wanted.

> > 
> > While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it
> > doesn't work if the number is higher, like 128. If we ever decide to
> > change the ID from an integer to say hex to save space, these globs
> > would break.
> > 
> > Is there some scheme that fits the C-variable name that addresses the
> > above scenarios? Brackets gave me a simple glob that seemed to prevent a
> > lot of this ("Asserts.\[*\]" in this case).
> 
> Prevent a lot of what? I'm not sure what your example here is.
> 

I'll try again :)

We have 2 events registered via user_events:
Asserts
Asserts.Verbose

Using dot notation these would result in tracepoints of:
user_events_multi/Asserts.0
user_events_multi/Asserts.Verbose.1

Using bracket notation these would result in tracepoints of:
user_events_multi/Asserts:[0]
user_events_multi/Asserts.Verbose:[1]

A recording program only wants to enable the Asserts tracepoint. It does
not want to record the Asserts.Verbose tracepoint.

The program must find the right tracepoint by scanning tracefs under the
user_events_multi system.

A single dot suffix does not allow a simple glob to be used. The glob
Asserts.* will return both Asserts.0 and Asserts.Verbose.1.

A simple glob of Asserts:\[*\] will only find Asserts:[0], it will not
find Asserts.Verbose:[1].

We could just use brackets and not have the colon (Asserts[0] in this
case). But brackets are still special for bash.

> > 
> > Are we confident that we always want to represent the ID as a base-10
> > integer vs a base-16 integer? The suffix will be ABI to ensure recording
> > programs can find their events easily.
> 
> Is there a difference to what we choose?
> 

If a simple glob of event_name:\[*\] cannot be used, then we must document
what the suffix format is, so an appropriate regex can be created. If we
start with base-10 then later move to base-16 we will break existing regex
patterns on the recording side.

I prefer, and have in this series, a base-16 output since it saves on
the tracepoint name size.

Either way we go, we need to define how recording programs should find
the events they care about. So we must be very clear, IMHO, about the
format of the tracepoint names in our documentation.

I personally think recording programs are likely to get this wrong
without proper guidance.

Thanks,
-Beau

> -- Steve



Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events

2024-01-29 Thread Beau Belgrave
On Fri, Jan 26, 2024 at 03:04:45PM -0500, Steven Rostedt wrote:
> On Fri, 26 Jan 2024 11:10:07 -0800
> Beau Belgrave  wrote:
> 
> > > OK, so the each different event has suffixed name. But this will
> > > introduce non C-variable name.
> > > 
> > > Steve, do you think your library can handle these symbols? It will
> > > be something like "event:[1]" as the event name.
> > > Personally I like "event.1" style. (of course we need to ensure the
> > > user given event name is NOT including such suffix numbers)
> > >   
> > 
> > Just to clarify around events including a suffix number. This is why
> > multi-events use "user_events_multi" system name and the single-events
> > using just "user_events".
> > 
> > Even if a user program did include a suffix, the suffix would still get
> > appended. An example is "test" vs "test:[0]" using multi-format would
> > result in two tracepoints ("test:[0]" and "test:[0]:[1]" respectively
> > (assuming these are the first multi-events on the system).
> > 
> > I'm with you, we really don't want any spoofing or squatting possible.
> > By using different system names and always appending the suffix I
> > believe covers this.
> > 
> > Looking forward to hearing Steven's thoughts on this as well.
> 
> I'm leaning towards Masami's suggestion to use dots, as that won't conflict
> with special characters from bash, as '[' and ']' do.
> 

Thanks, yeah ideally we wouldn't use special characters.

I'm not picky about this. However, I did want something that clearly
allowed a glob pattern to find all versions of a given register name of
user_events by user programs that record. The dot notation will pull in
more than expected if dotted namespace style names are used.

An example is "Asserts" and "Asserts.Verbose" from different programs.
If we tried to find all versions of "Asserts" via glob of "Asserts.*" it
will pull in "Asserts.Verbose.1" in addition to "Asserts.0".

While a glob of "Asserts.[0-9]" works when the unique ID is 0-9, it
doesn't work if the number is higher, like 128. If we ever decide to
change the ID from an integer to say hex to save space, these globs
would break.

Is there some scheme that fits the C-variable name that addresses the
above scenarios? Brackets gave me a simple glob that seemed to prevent a
lot of this ("Asserts.\[*\]" in this case).

Are we confident that we always want to represent the ID as a base-10
integer vs a base-16 integer? The suffix will be ABI to ensure recording
programs can find their events easily.

Thanks,
-Beau

> -- Steve



Re: [PATCH 2/4] tracing/user_events: Introduce multi-format events

2024-01-26 Thread Beau Belgrave
On Sat, Jan 27, 2024 at 12:01:04AM +0900, Masami Hiramatsu wrote:
> On Tue, 23 Jan 2024 22:08:42 +
> Beau Belgrave  wrote:
> 
> > Add a register_name (reg_name) to the user_event struct which allows for
> > split naming of events. We now have the name that was used to register
> > within user_events as well as the unique name for the tracepoint. Upon
> > registering events ensure matches based on first the reg_name, followed
> > by the fields and format of the event. This allows for multiple events
> > with the same registered name to have different formats. The underlying
> > tracepoint will have a unique name in the format of {reg_name}:[unique_id].
> > 
> > For example, if both "test u32 value" and "test u64 value" are used with
> > the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
> > tracepoints. The dynamic_events file would then show the following:
> >   u:test u64 count
> >   u:test u32 count
> > 
> > The actual tracepoint names look like this:
> >   test:[d5874fdac44]
> >   test:[d5914662cd4]
> > 
> > Both would be under the new user_events_multi system name to prevent the
> > older ABI from being used to squat on multi-formatted events and block
> > their use.
> [...]
> > @@ -1923,6 +1972,39 @@ static int user_event_trace_register(struct 
> > user_event *user)
> > return ret;
> >  }
> >  
> > +static int user_event_set_tp_name(struct user_event *user)
> > +{
> > +   lockdep_assert_held(>group->reg_mutex);
> > +
> > +   if (EVENT_MULTI_FORMAT(user->reg_flags)) {
> > +   char *multi_name;
> > +   int len;
> > +
> > +   len = snprintf(NULL, 0, "%s:[%llx]", user->reg_name,
> > +  user->group->multi_id) + 1;
> > +
> > +   multi_name = kzalloc(len, GFP_KERNEL_ACCOUNT);
> > +
> > +   if (!multi_name)
> > +   return -ENOMEM;
> > +
> > +   snprintf(multi_name, len, "%s:[%llx]", user->reg_name,
> > +user->group->multi_id);
> 
> OK, so the each different event has suffixed name. But this will
> introduce non C-variable name.
> 
> Steve, do you think your library can handle these symbols? It will
> be something like "event:[1]" as the event name.
> Personally I like "event.1" style. (of course we need to ensure the
> user given event name is NOT including such suffix numbers)
> 

Just to clarify around events including a suffix number. This is why
multi-events use "user_events_multi" system name and the single-events
using just "user_events".

Even if a user program did include a suffix, the suffix would still get
appended. An example is "test" vs "test:[0]" using multi-format would
result in two tracepoints ("test:[0]" and "test:[0]:[1]" respectively
(assuming these are the first multi-events on the system).

I'm with you, we really don't want any spoofing or squatting possible.
By using different system names and always appending the suffix I
believe covers this.

Looking forward to hearing Steven's thoughts on this as well.

Thanks,
-Beau

> Thank you.
> 
> -- 
> Masami Hiramatsu (Google) 



Re: [PATCH 0/4] tracing/user_events: Introduce multi-format events

2024-01-25 Thread Beau Belgrave
It appears to put an outdated coversheet onto this series.

Below is the updated coversheet that reflects changes made:

Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event in the future. When this flag is
used, create the underlying tracepoint backing the user_event with a
unique name per-version of the format. It's important that existing ABI
users do not get this logic automatically, even if one of the multi
format events matches the format. This ensures existing programs that
create events and assume the tracepoint name will match exactly continue
to work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}:[unique_id].

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test:[d5874fdac44]
  test:[d5914662cd4]

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Thanks,
-Beau



Re: [PATCH 1/4] tracing/user_events: Prepare find/delete for same name events

2024-01-25 Thread Beau Belgrave
On Thu, Jan 25, 2024 at 09:59:03AM +0900, Masami Hiramatsu wrote:
> On Tue, 23 Jan 2024 22:08:41 +
> Beau Belgrave  wrote:
> 
> > The current code for finding and deleting events assumes that there will
> > never be cases when user_events are registered with the same name, but
> > different formats. In the future this scenario will exist to ensure
> > user programs can be updated or modify their events and run different
> > versions of their programs side-by-side without being blocked.
> 
> Ah, this is a very important point. Kernel always has only one instance
> but user program doesn't. Thus it can define the same event name.
> For the similar problem, uprobe event assumes that the user (here
> admin) will define different group name to avoid it. But for the user
> event, it is embedded, hmm.
> 

Yes, the series will handle if multi-processes use the same name, we
will find a matching version of that name within the user_event group.
If there isn't one, a new one is created. Each is backed by an
independent tracepoint which does match up with how uprobe does it. This
actually got brought up in the tracefs meetings we've had and it seemed
to get wide agreement on how to best handle this.

> > 
> > This change does not yet allow for multi-format events. If user_events
> > are registered with the same name but different arguments the programs
> > see the same return values as before. This change simply makes it
> > possible to easily accomodate for this in future changes.
> > 
> > Update find_user_event() to take in argument parameters and register
> > flags to accomodate future multi-format event scenarios. Have find
> > validate argument matching and return error pointers to cover address
> > in use cases, or allocation errors. Update callers to handle error
> > pointer logic.
> 
> Understand, that is similar to what probe events do.
> 
> > 
> > Move delete_user_event() to use hash walking directly now that find has
> > changed. Delete all events found that match the register name, stop
> > if an error occurs and report back to the user.
> 
> What happen if we run 2 different version of the applications and terminate
> one of them? The event which is used by others will be kept?
> 

Each unique version of a user_event has it's own ref-count. If one
version is not-used, but another version is, only the not-used version
will get deleted. The other version that is in use will return a -EBUSY
when it gets to that version via enumeration.

While we only have a single tracepoint per-version, we have several
user_event structures in memory that have the same name, yet different
formats. Each of which have their own lifetime, enablers and ref-counts
to keep them isolated from each other.

Thanks,
-Beau

> Thank you,
> 
> > 
> > Update user_fields_match() to cover list_empty() scenarios instead of
> > each callsite doing it now that find_user_event() uses it directly.
> > 
> > Signed-off-by: Beau Belgrave 
> > ---
> >  kernel/trace/trace_events_user.c | 106 +--
> >  1 file changed, 58 insertions(+), 48 deletions(-)
> > 
> > diff --git a/kernel/trace/trace_events_user.c 
> > b/kernel/trace/trace_events_user.c
> > index 9365ce407426..0480579ba563 100644
> > --- a/kernel/trace/trace_events_user.c
> > +++ b/kernel/trace/trace_events_user.c
> > @@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct 
> > user_event_mm *mm);
> >  static struct user_event_mm *user_event_mm_get_all(struct user_event 
> > *user);
> >  static void user_event_mm_put(struct user_event_mm *mm);
> >  static int destroy_user_event(struct user_event *user);
> > +static bool user_fields_match(struct user_event *user, int argc,
> > + const char **argv);
> >  
> >  static u32 user_event_key(char *name)
> >  {
> > @@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event 
> > *user)
> >  }
> >  
> >  static struct user_event *find_user_event(struct user_event_group *group,
> > - char *name, u32 *outkey)
> > + char *name, int argc, const char 
> > **argv,
> > + u32 flags, u32 *outkey)
> >  {
> > struct user_event *user;
> > u32 key = user_event_key(name);
> >  
> > *outkey = key;
> >  
> > -   hash_for_each_possible(group->register_table, user, node, key)
> > -   if (!strcmp(EVENT_NAME(user), name))
> > +   hash_for_each_possible(group->register_table, user, node, key) {
> > +  

[PATCH 2/4] tracing/user_events: Introduce multi-format events

2024-01-23 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event in the future. When this flag is
used, create the underlying tracepoint backing the user_event with a
unique name per-version of the format. It's important that existing ABI
users do not get this logic automatically, even if one of the multi
format events matches the format. This ensures existing programs that
create events and assume the tracepoint name will match exactly continue
to work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Change system name of the multi-format event tracepoint to ensure that
multi-format events are isolated completely from single-format events.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}:[unique_id].

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test:[d5874fdac44]
  test:[d5914662cd4]

Both would be under the new user_events_multi system name to prevent the
older ABI from being used to squat on multi-formatted events and block
their use.

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Signed-off-by: Beau Belgrave 
---
 include/uapi/linux/user_events.h |   6 +-
 kernel/trace/trace_events_user.c | 118 +++
 2 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index f74f3aedd49c..a03de03dccbc 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -12,6 +12,7 @@
 #include 
 
 #define USER_EVENTS_SYSTEM "user_events"
+#define USER_EVENTS_MULTI_SYSTEM "user_events_multi"
 #define USER_EVENTS_PREFIX "u:"
 
 /* Create dynamic location entry within a 32-bit value */
@@ -22,8 +23,11 @@ enum user_reg_flag {
/* Event will not delete upon last reference closing */
USER_EVENT_REG_PERSIST  = 1U << 0,
 
+   /* Event will be allowed to have multiple formats */
+   USER_EVENT_REG_MULTI_FORMAT = 1U << 1,
+
/* This value or above is currently non-ABI */
-   USER_EVENT_REG_MAX  = 1U << 1,
+   USER_EVENT_REG_MAX  = 1U << 2,
 };
 
 /*
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 0480579ba563..f9c0781285b6 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -34,7 +34,8 @@
 
 /* Limit how long of an event name plus args within the subsystem. */
 #define MAX_EVENT_DESC 512
-#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define EVENT_NAME(user_event) ((user_event)->reg_name)
+#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
 
 /*
@@ -54,10 +55,13 @@
  * allows isolation for events by various means.
  */
 struct user_event_group {
-   char*system_name;
-   struct  hlist_node node;
-   struct  mutex reg_mutex;
+   char*system_name;
+   char*system_multi_name;
+   struct hlist_node   node;
+   struct mutexreg_mutex;
DECLARE_HASHTABLE(register_table, 8);
+   /* ID that moves forward within the group for multi-event names */
+   u64 multi_id;
 };
 
 /* Group for init

[PATCH 3/4] selftests/user_events: Test multi-format events

2024-01-23 Thread Beau Belgrave
User_events now has multi-format events which allow for the same
register name, but with different formats. When this occurs, different
tracepoints are created with unique names.

Add a new test that ensures the same name can be used for two different
formats. Ensure they are isolated from each other and that name and arg
matching still works if yet another register comes in with the same
format as one of the two.

Signed-off-by: Beau Belgrave 
---
 .../testing/selftests/user_events/abi_test.c  | 134 ++
 1 file changed, 134 insertions(+)

diff --git a/tools/testing/selftests/user_events/abi_test.c 
b/tools/testing/selftests/user_events/abi_test.c
index cef1ff1af223..b3480fad65a5 100644
--- a/tools/testing/selftests/user_events/abi_test.c
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "../kselftest_harness.h"
@@ -23,6 +25,62 @@
 
 const char *data_file = "/sys/kernel/tracing/user_events_data";
 const char *enable_file = 
"/sys/kernel/tracing/events/user_events/__abi_event/enable";
+const char *multi_dir_glob = 
"/sys/kernel/tracing/events/user_events_multi/__abi_event:*";
+
+static int wait_for_delete(char *dir)
+{
+   struct stat buf;
+   int i;
+
+   for (i = 0; i < 1; ++i) {
+   if (stat(dir, ) == -1 && errno == ENOENT)
+   return 0;
+
+   usleep(1000);
+   }
+
+   return -1;
+}
+
+static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len)
+{
+   char path[256];
+   glob_t buf;
+   int i, ret;
+
+   ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, );
+
+   if (ret)
+   return -1;
+
+   ret = -1;
+
+   for (i = 0; i < buf.gl_pathc; ++i) {
+   FILE *fp;
+
+   snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]);
+   fp = fopen(path, "r");
+
+   if (!fp)
+   continue;
+
+   while (fgets(path, sizeof(path), fp) != NULL) {
+   if (strstr(path, unique_field)) {
+   fclose(fp);
+   /* strscpy is not available, use snprintf */
+   snprintf(out_dir, dir_len, "%s", 
buf.gl_pathv[i]);
+   ret = 0;
+   goto out;
+   }
+   }
+
+   fclose(fp);
+   }
+out:
+   globfree();
+
+   return ret;
+}
 
 static bool event_exists(void)
 {
@@ -74,6 +132,39 @@ static int event_delete(void)
return ret;
 }
 
+static int reg_enable_multi(void *enable, int size, int bit, int flags,
+   char *args)
+{
+   struct user_reg reg = {0};
+   char full_args[512] = {0};
+   int fd = open(data_file, O_RDWR);
+   int len;
+   int ret;
+
+   if (fd < 0)
+   return -1;
+
+   len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args);
+
+   if (len > sizeof(full_args)) {
+   ret = -E2BIG;
+   goto out;
+   }
+
+   reg.size = sizeof(reg);
+   reg.name_args = (__u64)full_args;
+   reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags;
+   reg.enable_bit = bit;
+   reg.enable_addr = (__u64)enable;
+   reg.enable_size = size;
+
+   ret = ioctl(fd, DIAG_IOCSREG, );
+out:
+   close(fd);
+
+   return ret;
+}
+
 static int reg_enable_flags(void *enable, int size, int bit, int flags)
 {
struct user_reg reg = {0};
@@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) {
ASSERT_NE(0, reg_enable(>check, 128, 0));
 }
 
+TEST_F(user, multi_format) {
+   char first_dir[256];
+   char second_dir[256];
+   struct stat buf;
+
+   /* Multiple formats for the same name should work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 0,
+ 0, "u32 multi_first"));
+
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 1,
+ 0, "u64 multi_second"));
+
+   /* Same name with same format should also work */
+   ASSERT_EQ(0, reg_enable_multi(>check, sizeof(int), 2,
+ 0, "u64 multi_second"));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_first",
+ first_dir, sizeof(first_dir)));
+
+   ASSERT_EQ(0, find_multi_event_dir("multi_second",
+ second_dir, sizeof(second_dir)));
+
+   /* Should not be found in the same dir */
+   ASSERT_NE(0, strcmp(first_dir, second_dir));
+
+   /* First dir should still exist */
+   ASSERT_EQ(0, stat(first_dir, ));
+
+   /* Disabling first register sho

[PATCH 4/4] tracing/user_events: Document multi-format flag

2024-01-23 Thread Beau Belgrave
User programs can now ask user_events to handle the synchronization of
multiple different formats for an event with the same name via the new
USER_EVENT_REG_MULTI_FORMAT flag.

Add a section for USER_EVENT_REG_MULTI_FORMAT that explains the intended
purpose and caveats of using it. Explain how deletion works in these
cases and how to use /sys/kernel/tracing/dynamic_events for per-version
deletion.

Signed-off-by: Beau Belgrave 
---
 Documentation/trace/user_events.rst | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/Documentation/trace/user_events.rst 
b/Documentation/trace/user_events.rst
index d8f12442aaa6..35a2524bc73c 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -92,6 +92,20 @@ The following flags are currently supported.
   process closes or unregisters the event. Requires CAP_PERFMON otherwise
   -EPERM is returned.
 
++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This
+  allows programs to prevent themselves from being blocked when their event
+  format changes and they wish to use the same name. When this flag is used the
+  tracepoint name will be in the new format of "name:[unique_id]" vs the older
+  format of "name". A tracepoint will be created for each unique pair of name
+  and format. This means if several processes use the same name and format,
+  they will use the same tracepoint. If yet another process uses the same name,
+  but a different format than the other processes, it will use a different
+  tracepoint with a new unique id. Recording programs need to scan tracefs for
+  the various different formats of the event name they are interested in
+  recording. The system name of the tracepoint will also use 
"user_events_multi"
+  instead of "user_events". This prevents single-format event names conflicting
+  with any multi-format event names within tracefs.
+
 Upon successful registration the following is set.
 
 + write_index: The index to use for this file descriptor that represents this
@@ -106,6 +120,9 @@ or perf record -e user_events:[name] when 
attaching/recording.
 **NOTE:** The event subsystem name by default is "user_events". Callers should
 not assume it will always be "user_events". Operators reserve the right in the
 future to change the subsystem name per-process to accommodate event isolation.
+In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name
+will have a unique id appended to it and the system name will be
+"user_events_multi" as described above.
 
 Command Format
 ^^
@@ -156,7 +173,11 @@ to request deletes than the one used for registration due 
to this.
 to the event. If programs do not want auto-delete, they must use the
 USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used
 the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an
-event that persists requires CAP_PERFMON, otherwise -EPERM is returned.
+event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When
+there are multiple formats of the same event name, all events with the same
+name will be attempted to be deleted. If only a specific version is wanted to
+be deleted then the /sys/kernel/tracing/dynamic_events file should be used for
+that specific format of the event.
 
 Unregistering
 -
-- 
2.34.1




[PATCH 0/4] tracing/user_events: Introduce multi-format events

2024-01-23 Thread Beau Belgrave
Currently user_events supports 1 event with the same name and must have
the exact same format when referenced by multiple programs. This opens
an opportunity for malicous or poorly thought through programs to
create events that others use with different formats. Another scenario
is user programs wishing to use the same event name but add more fields
later when the software updates. Various versions of a program may be
running side-by-side, which is prevented by the current single format
requirement.

Add a new register flag (USER_EVENT_REG_MULTI_FORMAT) which indicates
the user program wishes to use the same user_event name, but may have
several different formats of the event in the future. When this flag is
used, create the underlying tracepoint backing the user_event with a
unique name per-version of the format. It's important that existing ABI
users do not get this logic automatically, even if one of the multi
format events matches the format. This ensures existing programs that
create events and assume the tracepoint name will match exactly continue
to work as expected. Add logic to only check multi-format events with
other multi-format events and single-format events to only check
single-format events during find.

Add a register_name (reg_name) to the user_event struct which allows for
split naming of events. We now have the name that was used to register
within user_events as well as the unique name for the tracepoint. Upon
registering events ensure matches based on first the reg_name, followed
by the fields and format of the event. This allows for multiple events
with the same registered name to have different formats. The underlying
tracepoint will have a unique name in the format of {reg_name}:[unique_id].
The unique_id is the time, in nanoseconds, of the event creation converted
to hex. Since this is done under the register mutex, it is extremely
unlikely for these IDs to ever match. It's also very unlikely a malicious
program could consistently guess what the name would be and attempt to
squat on it via the single format ABI.

For example, if both "test u32 value" and "test u64 value" are used with
the USER_EVENT_REG_MULTI_FORMAT the system would have 2 unique
tracepoints. The dynamic_events file would then show the following:
  u:test u64 count
  u:test u32 count

The actual tracepoint names look like this:
  test:[d5874fdac44]
  test:[d5914662cd4]

Deleting events via "!u:test u64 count" would only delete the first
tracepoint that matched that format. When the delete ABI is used all
events with the same name will be attempted to be deleted. If
per-version deletion is required, user programs should either not use
persistent events or delete them via dynamic_events.

Beau Belgrave (4):
  tracing/user_events: Prepare find/delete for same name events
  tracing/user_events: Introduce multi-format events
  selftests/user_events: Test multi-format events
  tracing/user_events: Document multi-format flag

 Documentation/trace/user_events.rst   |  23 +-
 include/uapi/linux/user_events.h  |   6 +-
 kernel/trace/trace_events_user.c  | 224 +-
 .../testing/selftests/user_events/abi_test.c  | 134 +++
 4 files changed, 325 insertions(+), 62 deletions(-)


base-commit: 610a9b8f49fbcf1100716370d3b5f6f884a2835a
-- 
2.34.1




[PATCH 1/4] tracing/user_events: Prepare find/delete for same name events

2024-01-23 Thread Beau Belgrave
The current code for finding and deleting events assumes that there will
never be cases when user_events are registered with the same name, but
different formats. In the future this scenario will exist to ensure
user programs can be updated or modify their events and run different
versions of their programs side-by-side without being blocked.

This change does not yet allow for multi-format events. If user_events
are registered with the same name but different arguments the programs
see the same return values as before. This change simply makes it
possible to easily accomodate for this in future changes.

Update find_user_event() to take in argument parameters and register
flags to accomodate future multi-format event scenarios. Have find
validate argument matching and return error pointers to cover address
in use cases, or allocation errors. Update callers to handle error
pointer logic.

Move delete_user_event() to use hash walking directly now that find has
changed. Delete all events found that match the register name, stop
if an error occurs and report back to the user.

Update user_fields_match() to cover list_empty() scenarios instead of
each callsite doing it now that find_user_event() uses it directly.

Signed-off-by: Beau Belgrave 
---
 kernel/trace/trace_events_user.c | 106 +--
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..0480579ba563 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -202,6 +202,8 @@ static struct user_event_mm *user_event_mm_get(struct 
user_event_mm *mm);
 static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
 static void user_event_mm_put(struct user_event_mm *mm);
 static int destroy_user_event(struct user_event *user);
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv);
 
 static u32 user_event_key(char *name)
 {
@@ -1493,17 +1495,24 @@ static int destroy_user_event(struct user_event *user)
 }
 
 static struct user_event *find_user_event(struct user_event_group *group,
- char *name, u32 *outkey)
+ char *name, int argc, const char 
**argv,
+ u32 flags, u32 *outkey)
 {
struct user_event *user;
u32 key = user_event_key(name);
 
*outkey = key;
 
-   hash_for_each_possible(group->register_table, user, node, key)
-   if (!strcmp(EVENT_NAME(user), name))
+   hash_for_each_possible(group->register_table, user, node, key) {
+   if (strcmp(EVENT_NAME(user), name))
+   continue;
+
+   if (user_fields_match(user, argc, argv))
return user_event_get(user);
 
+   return ERR_PTR(-EADDRINUSE);
+   }
+
return NULL;
 }
 
@@ -1860,6 +1869,9 @@ static bool user_fields_match(struct user_event *user, 
int argc,
struct list_head *head = >fields;
int i = 0;
 
+   if (argc == 0)
+   return list_empty(head);
+
list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, ))
return false;
@@ -1880,10 +1892,8 @@ static bool user_event_match(const char *system, const 
char *event,
match = strcmp(EVENT_NAME(user), event) == 0 &&
(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
 
-   if (match && argc > 0)
+   if (match)
match = user_fields_match(user, argc, argv);
-   else if (match && argc == 0)
-   match = list_empty(>fields);
 
return match;
 }
@@ -1922,11 +1932,11 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
char *args, char *flags,
struct user_event **newuser, int reg_flags)
 {
-   int ret;
-   u32 key;
struct user_event *user;
+   char **argv = NULL;
int argc = 0;
-   char **argv;
+   int ret;
+   u32 key;
 
/* Currently don't support any text based flags */
if (flags != NULL)
@@ -1935,41 +1945,34 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
if (!user_event_capable(reg_flags))
return -EPERM;
 
+   if (args) {
+   argv = argv_split(GFP_KERNEL, args, );
+
+   if (!argv)
+   return -ENOMEM;
+   }
+
/* Prevent dyn_event from racing */
mutex_lock(_mutex);
-   user = find_user_event(group, name, );
+   user = find_user_event(group, name, argc, (const char **)argv,
+  reg_flags, );
mutex_unlock(_mutex);
 
-   if (user) {
-   if (args) {
-  

Re: [PATCH] tracing user_events: Simplify user_event_parse_field() parsing

2024-01-08 Thread Beau Belgrave
On Mon, Jan 08, 2024 at 01:37:23PM -0500, Steven Rostedt wrote:
> From: "Steven Rostedt (Google)" 
> 
> Instead of having a bunch of if statements with:
> 
>len = str_has_prefix(field, "__data_loc unsigned ");
>if (len)
>goto skip_next;
> 
>len = str_has_prefix(field, "__data_loc ");
>if (len)
>goto skip_next;
> 
>len = str_has_prefix(field, "__rel_loc unsigned ");
>if (len)
>goto skip_next;
> 
>len = str_has_prefix(field, "__rel_loc ");
>if (len)
>goto skip_next;
> 
>   goto parse;
> 
>  skip_next:
> 
> Consolidate it into a negative check and jump to parse if all the
> str_has_prefix() calls fail. If one succeeds, it will just continue with
> len equal to the proper string:
> 
>if (!(len = str_has_prefix(field, "__data_loc unsigned ")) &&
>!(len = str_has_prefix(field, "__data_loc ")) &&
>!(len = str_has_prefix(field, "__rel_loc unsigned ")) &&
>!(len = str_has_prefix(field, "__rel_loc "))) {
>goto parse;
>}
> 
>  skip_next:
> 
> Signed-off-by: Steven Rostedt (Google) 
> ---
>  kernel/trace/trace_events_user.c | 22 ++
>  1 file changed, 6 insertions(+), 16 deletions(-)
> 
> diff --git a/kernel/trace/trace_events_user.c 
> b/kernel/trace/trace_events_user.c
> index 9365ce407426..ce0c5f1ded48 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -1175,23 +1175,13 @@ static int user_event_parse_field(char *field, struct 
> user_event *user,
>   goto skip_next;
>   }
>  
> - len = str_has_prefix(field, "__data_loc unsigned ");
> - if (len)
> - goto skip_next;
> -
> - len = str_has_prefix(field, "__data_loc ");
> - if (len)
> - goto skip_next;
> -
> - len = str_has_prefix(field, "__rel_loc unsigned ");
> - if (len)
> - goto skip_next;
> -
> - len = str_has_prefix(field, "__rel_loc ");
> - if (len)
> - goto skip_next;
> + if (!(len = str_has_prefix(field, "__data_loc unsigned ")) &&
> + !(len = str_has_prefix(field, "__data_loc ")) &&
> + !(len = str_has_prefix(field, "__rel_loc unsigned ")) &&
> + !(len = str_has_prefix(field, "__rel_loc "))) {
> + goto parse;
> + }

This now triggers a checkpatch error:
ERROR: do not use assignment in if condition
#1184: FILE: kernel/trace/trace_events_user.c:1184:
+   if (!(len = str_has_prefix(field, "__data_loc unsigned ")) &&

I personally prefer to keep these files fully checkpatch clean.
However, I did test these changes under the self-tests and it passed.

Do they bug you that much? :)

Thanks,
-Beau

>  
> - goto parse;
>  skip_next:
>   type = field;
>   field = strpbrk(field + len, " ");
> -- 
> 2.43.0



[PATCH] eventfs: Fix events beyond NAME_MAX blocking tasks

2023-12-10 Thread Beau Belgrave
Eventfs uses simple_lookup(), however, it will fail if the name of the
entry is beyond NAME_MAX length. When this error is encountered, eventfs
still tries to create dentries instead of skipping the dentry creation.
When the dentry is attempted to be created in this state d_wait_lookup()
will loop forever, waiting for the lookup to be removed.

Fix eventfs to return the error in simple_lookup() back to the caller
instead of continuing to try to create the dentry.

Fixes: 63940449555e ("eventfs: Implement eventfs lookup, read, open functions")
Link: 
https://lore.kernel.org/linux-trace-kernel/20231208183601.ga46-be...@linux.microsoft.com/
Signed-off-by: Beau Belgrave 
---
Please note the fixes tag is the first add of simple_lookup logic, there
have been other changes beyond that may be a better fit to targetting.

 fs/tracefs/event_inode.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index f8a594a50ae6..d2c06ba26db4 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -561,6 +561,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
if (strcmp(ei_child->name, name) != 0)
continue;
ret = simple_lookup(dir, dentry, flags);
+   if (IS_ERR(ret))
+   goto out;
create_dir_dentry(ei, ei_child, ei_dentry, true);
created = true;
break;
@@ -583,6 +585,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
if (r <= 0)
continue;
ret = simple_lookup(dir, dentry, flags);
+   if (IS_ERR(ret))
+   goto out;
create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
   fops, true);
break;
-- 
2.34.1




trace_event names with more than NAME_MAX chars

2023-12-08 Thread Beau Belgrave
While developing some unrelated features I happened to create a
trace_event that was more than NAME_MAX (255) characters. When this
happened the creation worked, but tracefs would hang any task that tried
to list the directory of the trace_event or remove it.

I followed the code down to the reason being eventfs would call
simple_lookup(), and if it failed, it would still try to create the
dentry. In this case DCACHE_PAR_LOOKUP would get set and never cleared.
This caused d_wait_lookup() to loop forever, since that flag is used in
d_in_lookup().

Both tracefs and eventfs use simple_lookup() and it fails for
dentries that exceed NAME_MAX. Should we even allow trace_events to
be created that exceed this limit? Or should tracefs/eventfs allow
this but somehow represent these differently?

I have a fix that appears to work for myself, but unsure if there are
other locations (attached at the end of this mail).

Thanks,
-Beau

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index f8a594a50ae6..d2c06ba26db4 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -561,6 +561,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
if (strcmp(ei_child->name, name) != 0)
continue;
ret = simple_lookup(dir, dentry, flags);
+   if (IS_ERR(ret))
+   goto out;
create_dir_dentry(ei, ei_child, ei_dentry, true);
created = true;
break;
@@ -583,6 +585,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
if (r <= 0)
continue;
ret = simple_lookup(dir, dentry, flags);
+   if (IS_ERR(ret))
+   goto out;
create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
   fops, true);
break;



Re: [PATCH] tracing: Have trace_event_file have ref counters

2023-10-31 Thread Beau Belgrave
On Tue, Oct 31, 2023 at 12:24:53PM -0400, Steven Rostedt wrote:
> From: "Steven Rostedt (Google)" 
> 
> The following can crash the kernel:
> 
>  # cd /sys/kernel/tracing
>  # echo 'p:sched schedule' > kprobe_events
>  # exec 5>>events/kprobes/sched/enable
>  # > kprobe_events
>  # exec 5>&-
> 
> The above commands:
> 
>  1. Change directory to the tracefs directory
>  2. Create a kprobe event (doesn't matter what one)
>  3. Open bash file descriptor 5 on the enable file of the kprobe event
>  4. Delete the kprobe event (removes the files too)
>  5. Close the bash file descriptor 5
> 
> The above causes a crash!
> 
>  BUG: kernel NULL pointer dereference, address: 0028
>  #PF: supervisor read access in kernel mode
>  #PF: error_code(0x) - not-present page
>  PGD 0 P4D 0
>  Oops:  [#1] PREEMPT SMP PTI
>  CPU: 6 PID: 877 Comm: bash Not tainted 
> 6.5.0-rc4-test-8-g2c6b6b1029d4-dirty #186
>  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.16.2-debian-1.16.2-1 04/01/2014
>  RIP: 0010:tracing_release_file_tr+0xc/0x50
> 
> What happens here is that the kprobe event creates a trace_event_file
> "file" descriptor that represents the file in tracefs to the event. It
> maintains state of the event (is it enabled for the given instance?).
> Opening the "enable" file gets a reference to the event "file" descriptor
> via the open file descriptor. When the kprobe event is deleted, the file is
> also deleted from the tracefs system which also frees the event "file"
> descriptor.
> 
> But as the tracefs file is still opened by user space, it will not be
> totally removed until the final dput() is called on it. But this is not
> true with the event "file" descriptor that is already freed. If the user
> does a write to or simply closes the file descriptor it will reference the
> event "file" descriptor that was just freed, causing a use-after-free bug.
> 
> To solve this, add a ref count to the event "file" descriptor as well as a
> new flag called "FREED". The "file" will not be freed until the last
> reference is released. But the FREE flag will be set when the event is
> removed to prevent any more modifications to that event from happening,
> even if there's still a reference to the event "file" descriptor.
> 
> Link: 
> https://lore.kernel.org/linux-trace-kernel/2023103131.1e705...@gandalf.local.home/
> 

I ran this patch with your repro and via the user_events ftrace self
test repro. Both worked correctly, thanks!

Tested-by: Beau Belgrave 

Thanks,
-Beau

> Cc: sta...@vger.kernel.org
> Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and 
> filter files")
> Reported-by: Beau Belgrave 
> Signed-off-by: Steven Rostedt (Google) 
> ---
>  include/linux/trace_events.h   |  4 
>  kernel/trace/trace.c   | 15 +++
>  kernel/trace/trace.h   |  3 +++
>  kernel/trace/trace_events.c| 31 ++
>  kernel/trace/trace_events_filter.c |  3 +++
>  5 files changed, 52 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 12207dc6722d..696f8dc4aa53 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -492,6 +492,7 @@ enum {
>   EVENT_FILE_FL_TRIGGER_COND_BIT,
>   EVENT_FILE_FL_PID_FILTER_BIT,
>   EVENT_FILE_FL_WAS_ENABLED_BIT,
> + EVENT_FILE_FL_FREED_BIT,
>  };
>  
>  extern struct trace_event_file *trace_get_event_file(const char *instance,
> @@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd 
> *cmd, ...);
>   *  TRIGGER_COND  - When set, one or more triggers has an associated filter
>   *  PID_FILTER- When set, the event is filtered based on pid
>   *  WAS_ENABLED   - Set when enabled to know to clear trace on module removal
> + *  FREED - File descriptor is freed, all fields should be 
> considered invalid
>   */
>  enum {
>   EVENT_FILE_FL_ENABLED   = (1 << EVENT_FILE_FL_ENABLED_BIT),
> @@ -643,6 +645,7 @@ enum {
>   EVENT_FILE_FL_TRIGGER_COND  = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
>   EVENT_FILE_FL_PID_FILTER= (1 << EVENT_FILE_FL_PID_FILTER_BIT),
>   EVENT_FILE_FL_WAS_ENABLED   = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
> + EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT),
>  };
>  
>  struct trace_event_file {
> @@ -671,6 +674,7 @@ struct trace_event_file {
>* caching and such. Which is mostly OK ;-)
&g

Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment

2023-09-25 Thread Beau Belgrave
On Mon, Sep 25, 2023 at 09:53:16AM +0200, Clément Léger wrote:
> 
> 
> On 22/09/2023 21:22, Beau Belgrave wrote:
> > On Tue, Sep 19, 2023 at 02:59:12PM +0200, Clément Léger wrote:
> >>
> >>
> >> On 14/09/2023 19:29, Steven Rostedt wrote:
> >>> On Thu, 14 Sep 2023 13:17:00 -0400
> >>> Steven Rostedt  wrote:
> >>>
> >>>> Now lets look at big endian layout:
> >>>>
> >>>>  uaddr = 0xbeef0004
> >>>>  enabler = 1;
> >>>>
> >>>>  memory at 0xbeef:  00 00 00 00 00 00 00 02
> >>>> ^
> >>>> addr: 0xbeef0004
> >>>>
> >>>>  (enabler is set )
> >>>>
> >>>>  bitoffset = uaddr & (sizeof(unsigned long) - 1); bitoffset = 4
> >>>>  bit_offset *= 8; bitoffset = 32
> >>>>  uaddr &= ~(sizeof(unsigned long) - 1);   uaddr = 0xbeef
> >>>>
> >>>>  ptr = kaddr + (uaddr & ~PAGE_MASK);
> >>>>
> >>>>  clear_bit(1 + 32, ptr);
> >>>>
> >>>>  memory at 0xbeef:  00 00 00 00 00 00 00 02
> >>>>   ^
> >>>>  bit 33 of 0xbeef
> >>>>
> >>>> I don't think that's what you expected!
> >>>
> >>> I believe the above can be fixed with:
> >>>
> >>>   bit_offset = uaddr & (sizeof(unsigned long) - 1);
> >>>   if (bit_offset) {
> >>> #ifdef CONFIG_CPU_BIG_ENDIAN
> >>>   bit_offest = 0;
> >>> #else
> >>>   bit_offset *= BITS_PER_BYTE;
> >>> #endif
> >>>   uaddr &= ~(sizeof(unsigned long) - 1);
> >>>   }
> >>>
> >>> -- Steve
> >>
> >>
> >> Actually, after looking more in depth at that, it seems like there are
> >> actually 2 problems that can happen.
> >>
> >> First one is atomic access misalignment due to enable_size == 4 and
> >> uaddr not being aligned on a (long) boundary on 64 bits architecture.
> >> This can generate misaligned exceptions on various architectures. This
> >> can be fixed in a more general way according to Masami snippet.
> >>
> >> Second one that I can see is on 64 bits, big endian architectures with
> >> enable_size == 4. In that case, the bit provided by the userspace won't
> >> be correctly set since this code kind of assume that the atomic are done
> >> on 32bits value. Since the kernel assume long sized atomic operation, on
> >> big endian 64 bits architecture, the updated bit will actually be in the
> >> next 32 bits word.
> >>
> >> Can someone confirm my understanding ?
> >>
> > 
> > I have a ppc 64bit BE VM I've been validating this on. If we do the
> > shifting within user_events (vs a generic set_bit_aligned approach)
> > 64bit BE does not need additional bit manipulation. However, if we were
> > to blindly pass the address and bit as is to set_bit_aligned() it
> > assumes the bit number is for a long, not a 32 bit word. So for that
> > approach we would need to offset the bit in the unaligned case.
> > 
> > Here's a patch I have that I've validated on ppc64 BE, aarch64 LE, and
> > x86_64 LE. I personally feel more comfortable with this approach than
> > the generic set_bit_aligned() one.
> > 
> > Thanks,
> > -Beau
> > 
> > diff --git a/kernel/trace/trace_events_user.c 
> > b/kernel/trace/trace_events_user.c
> > index e3f2b8d72e01..ae854374d0b7 100644
> > --- a/kernel/trace/trace_events_user.c
> > +++ b/kernel/trace/trace_events_user.c
> > @@ -162,6 +162,23 @@ struct user_event_validator {
> > int flags;
> >  };
> >  
> > +static inline void align_addr_bit(unsigned long *addr, int *bit)
> > +{
> > +   if (IS_ALIGNED(*addr, sizeof(long)))
> > +   return;
> > +
> > +   *addr = ALIGN_DOWN(*addr, sizeof(long));
> > +
> > +   /*
> > +* We only support 32 and 64 bit values. The only time we need
> > +* to align is a 32 bit value on a 64 bit kernel, which on LE
> > +* is always 32 bits, and on BE requires no change.
> > +*/
> > +#ifdef __LITTLE_ENDIAN
> > +   *bit += 32;
> > +#endif
> 
> Hi Beau, except the specific alignment that 

Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment

2023-09-22 Thread Beau Belgrave
On Tue, Sep 19, 2023 at 02:59:12PM +0200, Clément Léger wrote:
> 
> 
> On 14/09/2023 19:29, Steven Rostedt wrote:
> > On Thu, 14 Sep 2023 13:17:00 -0400
> > Steven Rostedt  wrote:
> > 
> >> Now lets look at big endian layout:
> >>
> >>  uaddr = 0xbeef0004
> >>  enabler = 1;
> >>
> >>  memory at 0xbeef:  00 00 00 00 00 00 00 02
> >> ^
> >> addr: 0xbeef0004
> >>
> >>(enabler is set )
> >>
> >>bitoffset = uaddr & (sizeof(unsigned long) - 1); bitoffset = 4
> >>bit_offset *= 8; bitoffset = 32
> >>uaddr &= ~(sizeof(unsigned long) - 1);   uaddr = 0xbeef
> >>
> >>ptr = kaddr + (uaddr & ~PAGE_MASK);
> >>
> >>clear_bit(1 + 32, ptr);
> >>
> >>  memory at 0xbeef:  00 00 00 00 00 00 00 02
> >>   ^
> >>bit 33 of 0xbeef
> >>
> >> I don't think that's what you expected!
> > 
> > I believe the above can be fixed with:
> > 
> > bit_offset = uaddr & (sizeof(unsigned long) - 1);
> > if (bit_offset) {
> > #ifdef CONFIG_CPU_BIG_ENDIAN
> > bit_offest = 0;
> > #else
> > bit_offset *= BITS_PER_BYTE;
> > #endif
> > uaddr &= ~(sizeof(unsigned long) - 1);
> > }
> > 
> > -- Steve
> 
> 
> Actually, after looking more in depth at that, it seems like there are
> actually 2 problems that can happen.
> 
> First one is atomic access misalignment due to enable_size == 4 and
> uaddr not being aligned on a (long) boundary on 64 bits architecture.
> This can generate misaligned exceptions on various architectures. This
> can be fixed in a more general way according to Masami snippet.
> 
> Second one that I can see is on 64 bits, big endian architectures with
> enable_size == 4. In that case, the bit provided by the userspace won't
> be correctly set since this code kind of assume that the atomic are done
> on 32bits value. Since the kernel assume long sized atomic operation, on
> big endian 64 bits architecture, the updated bit will actually be in the
> next 32 bits word.
> 
> Can someone confirm my understanding ?
> 

Actually, I take back some of what I said [1]. If a 32-bit on a 64-bit
kernel comes in on BE, and is aligned, we do need to offset the bits as
well (just verified on my ppc64 BE VM).

You should be able to use that patch as a base though and add a flag to
struct user_event_enabler when this case occurs. Then in the
align_addr_bit() adjust the bits as well upon aligned cases.

Thanks,
-Beau

1. 
https://lore.kernel.org/linux-trace-kernel/20230922192231.ga1828-be...@linux.microsoft.com/

> Clément



Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment

2023-09-22 Thread Beau Belgrave
On Tue, Sep 19, 2023 at 02:59:12PM +0200, Clément Léger wrote:
> 
> 
> On 14/09/2023 19:29, Steven Rostedt wrote:
> > On Thu, 14 Sep 2023 13:17:00 -0400
> > Steven Rostedt  wrote:
> > 
> >> Now lets look at big endian layout:
> >>
> >>  uaddr = 0xbeef0004
> >>  enabler = 1;
> >>
> >>  memory at 0xbeef:  00 00 00 00 00 00 00 02
> >> ^
> >> addr: 0xbeef0004
> >>
> >>(enabler is set )
> >>
> >>bitoffset = uaddr & (sizeof(unsigned long) - 1); bitoffset = 4
> >>bit_offset *= 8; bitoffset = 32
> >>uaddr &= ~(sizeof(unsigned long) - 1);   uaddr = 0xbeef
> >>
> >>ptr = kaddr + (uaddr & ~PAGE_MASK);
> >>
> >>clear_bit(1 + 32, ptr);
> >>
> >>  memory at 0xbeef:  00 00 00 00 00 00 00 02
> >>   ^
> >>bit 33 of 0xbeef
> >>
> >> I don't think that's what you expected!
> > 
> > I believe the above can be fixed with:
> > 
> > bit_offset = uaddr & (sizeof(unsigned long) - 1);
> > if (bit_offset) {
> > #ifdef CONFIG_CPU_BIG_ENDIAN
> > bit_offest = 0;
> > #else
> > bit_offset *= BITS_PER_BYTE;
> > #endif
> > uaddr &= ~(sizeof(unsigned long) - 1);
> > }
> > 
> > -- Steve
> 
> 
> Actually, after looking more in depth at that, it seems like there are
> actually 2 problems that can happen.
> 
> First one is atomic access misalignment due to enable_size == 4 and
> uaddr not being aligned on a (long) boundary on 64 bits architecture.
> This can generate misaligned exceptions on various architectures. This
> can be fixed in a more general way according to Masami snippet.
> 
> Second one that I can see is on 64 bits, big endian architectures with
> enable_size == 4. In that case, the bit provided by the userspace won't
> be correctly set since this code kind of assume that the atomic are done
> on 32bits value. Since the kernel assume long sized atomic operation, on
> big endian 64 bits architecture, the updated bit will actually be in the
> next 32 bits word.
> 
> Can someone confirm my understanding ?
> 

I have a ppc 64bit BE VM I've been validating this on. If we do the
shifting within user_events (vs a generic set_bit_aligned approach)
64bit BE does not need additional bit manipulation. However, if we were
to blindly pass the address and bit as is to set_bit_aligned() it
assumes the bit number is for a long, not a 32 bit word. So for that
approach we would need to offset the bit in the unaligned case.

Here's a patch I have that I've validated on ppc64 BE, aarch64 LE, and
x86_64 LE. I personally feel more comfortable with this approach than
the generic set_bit_aligned() one.

Thanks,
-Beau

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index e3f2b8d72e01..ae854374d0b7 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -162,6 +162,23 @@ struct user_event_validator {
int flags;
 };
 
+static inline void align_addr_bit(unsigned long *addr, int *bit)
+{
+   if (IS_ALIGNED(*addr, sizeof(long)))
+   return;
+
+   *addr = ALIGN_DOWN(*addr, sizeof(long));
+
+   /*
+* We only support 32 and 64 bit values. The only time we need
+* to align is a 32 bit value on a 64 bit kernel, which on LE
+* is always 32 bits, and on BE requires no change.
+*/
+#ifdef __LITTLE_ENDIAN
+   *bit += 32;
+#endif
+}
+
 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
   void *tpdata, bool *faulted);
 
@@ -481,6 +498,7 @@ static int user_event_enabler_write(struct user_event_mm 
*mm,
unsigned long *ptr;
struct page *page;
void *kaddr;
+   int bit = ENABLE_BIT(enabler);
int ret;
 
lockdep_assert_held(_mutex);
@@ -496,6 +514,8 @@ static int user_event_enabler_write(struct user_event_mm 
*mm,
 test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler
return -EBUSY;
 
+   align_addr_bit(, );
+
ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
, NULL);
 
@@ -514,9 +534,9 @@ static int user_event_enabler_write(struct user_event_mm 
*mm,
 
/* Update bit atomically, user tracers must be atomic as well */
if (enabler->event && enabler->event->status)
-   set_bit(ENABLE_BIT(enabler), ptr);
+   set_bit(bit, ptr);
else
-   clear_bit(ENABLE_BIT(enabler), ptr);
+   clear_bit(bit, ptr);
 
kunmap_local(kaddr);
unpin_user_pages_dirty_lock(, 1, true);



Re: [PATCH] tracing/user_events: align uaddr on unsigned long alignment

2023-09-14 Thread Beau Belgrave
On Thu, Sep 14, 2023 at 03:11:02PM +0200, Clément Léger wrote:
> enabler->uaddr can be aligned on 32 or 64 bits. If aligned on 32 bits,
> this will result in a misaligned access on 64 bits architectures since
> set_bit()/clear_bit() are expecting an unsigned long (aligned) pointer.
> On architecture that do not support misaligned access, this will crash
> the kernel. Align uaddr on unsigned long size to avoid such behavior.
> This bug was found while running kselftests on RISC-V.
> 
> Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event 
> enablement")
> Signed-off-by: Clément Léger 

Thanks for fixing! I have a few comments on this.

I unfortunately do not have RISC-V hardware to validate this on.

> ---
>  kernel/trace/trace_events_user.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/trace/trace_events_user.c 
> b/kernel/trace/trace_events_user.c
> index 6f046650e527..580c0fe4b23e 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -479,7 +479,7 @@ static int user_event_enabler_write(struct user_event_mm 
> *mm,
>   bool fixup_fault, int *attempt)
>  {
>   unsigned long uaddr = enabler->addr;
> - unsigned long *ptr;
> + unsigned long *ptr, bit_offset;
>   struct page *page;
>   void *kaddr;
>   int ret;
> @@ -511,13 +511,19 @@ static int user_event_enabler_write(struct 
> user_event_mm *mm,
>   }
>  
>   kaddr = kmap_local_page(page);
> +
> + bit_offset = uaddr & (sizeof(unsigned long) - 1);
> + if (bit_offset) {
> + bit_offset *= 8;

I think for future readers of this code it would be more clear to use
BITS_PER_BYTE instead of the hardcoded 8. Given we always align on a
"natural" boundary, I believe the bit_offset will always be 32 bits.

A comment here might help clarify why we do this as well in case folks
don't see the change description.

> + uaddr &= ~(sizeof(unsigned long) - 1);

Shouldn't this uaddr change be done before calling pin_user_pages_remote()
to ensure things cannot go bad? (I don't think they can, but it looks a
little odd).

Thanks,
-Beau

> + }
>   ptr = kaddr + (uaddr & ~PAGE_MASK);
>  
>   /* Update bit atomically, user tracers must be atomic as well */
>   if (enabler->event && enabler->event->status)
> - set_bit(ENABLE_BIT(enabler), ptr);
> + set_bit(ENABLE_BIT(enabler) + bit_offset, ptr);
>   else
> - clear_bit(ENABLE_BIT(enabler), ptr);
> + clear_bit(ENABLE_BIT(enabler) + bit_offset, ptr);
>  
>   kunmap_local(kaddr);
>   unpin_user_pages_dirty_lock(, 1, true);
> -- 
> 2.40.1


[PATCH v2 1/3] tracing/user_events: Allow events to persist for perfmon_capable users

2023-09-12 Thread Beau Belgrave
There are several scenarios that have come up where having a user_event
persist even if the process that registered it exits. The main one is
having a daemon create events on bootup that shouldn't get deleted if
the daemon has to exit or reload. Another is within OpenTelemetry
exporters, they wish to potentially check if a user_event exists on the
system to determine if exporting the data out should occur. The
user_event in this case must exist even in the absence of the owning
process running (such as the above daemon case).

Expose the previously internal flag USER_EVENT_REG_PERSIST to user
processes. Upon register or delete of events with this flag, ensure the
user is perfmon_capable to prevent random user processes with access to
tracefs from creating events that persist after exit.

Signed-off-by: Beau Belgrave 
---
 include/uapi/linux/user_events.h | 11 +-
 kernel/trace/trace_events_user.c | 36 +++-
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index 2984aae4a2b4..f74f3aedd49c 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -17,6 +17,15 @@
 /* Create dynamic location entry within a 32-bit value */
 #define DYN_LOC(offset, size) ((size) << 16 | (offset))
 
+/* List of supported registration flags */
+enum user_reg_flag {
+   /* Event will not delete upon last reference closing */
+   USER_EVENT_REG_PERSIST  = 1U << 0,
+
+   /* This value or above is currently non-ABI */
+   USER_EVENT_REG_MAX  = 1U << 1,
+};
+
 /*
  * Describes an event registration and stores the results of the registration.
  * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
@@ -33,7 +42,7 @@ struct user_reg {
/* Input: Enable size in bytes at address */
__u8enable_size;
 
-   /* Input: Flags for future use, set to 0 */
+   /* Input: Flags to use, if any */
__u16   flags;
 
/* Input: Address to update when enabled */
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 6f046650e527..e3f2b8d72e01 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -49,18 +49,6 @@
 #define EVENT_STATUS_PERF BIT(1)
 #define EVENT_STATUS_OTHER BIT(7)
 
-/*
- * User register flags are not allowed yet, keep them here until we are
- * ready to expose them out to the user ABI.
- */
-enum user_reg_flag {
-   /* Event will not delete upon last reference closing */
-   USER_EVENT_REG_PERSIST  = 1U << 0,
-
-   /* This value or above is currently non-ABI */
-   USER_EVENT_REG_MAX  = 1U << 1,
-};
-
 /*
  * Stores the system name, tables, and locks for a group of events. This
  * allows isolation for events by various means.
@@ -191,6 +179,17 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
 }
 
+static bool user_event_capable(u16 reg_flags)
+{
+   /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */
+   if (reg_flags & USER_EVENT_REG_PERSIST) {
+   if (!perfmon_capable())
+   return false;
+   }
+
+   return true;
+}
+
 static struct user_event *user_event_get(struct user_event *user)
 {
refcount_inc(>refcnt);
@@ -1773,6 +1772,9 @@ static int user_event_free(struct dyn_event *ev)
if (!user_event_last_ref(user))
return -EBUSY;
 
+   if (!user_event_capable(user->reg_flags))
+   return -EPERM;
+
return destroy_user_event(user);
 }
 
@@ -1888,10 +1890,13 @@ static int user_event_parse(struct user_event_group 
*group, char *name,
int argc = 0;
char **argv;
 
-   /* User register flags are not ready yet */
-   if (reg_flags != 0 || flags != NULL)
+   /* Currently don't support any text based flags */
+   if (flags != NULL)
return -EINVAL;
 
+   if (!user_event_capable(reg_flags))
+   return -EPERM;
+
/* Prevent dyn_event from racing */
mutex_lock(_mutex);
user = find_user_event(group, name, );
@@ -2024,6 +2029,9 @@ static int delete_user_event(struct user_event_group 
*group, char *name)
if (!user_event_last_ref(user))
return -EBUSY;
 
+   if (!user_event_capable(user->reg_flags))
+   return -EPERM;
+
return destroy_user_event(user);
 }
 
-- 
2.34.1



[PATCH v2 0/3] tracing/user_events: Allow events to persist for perfmon_capable users

2023-09-12 Thread Beau Belgrave
There are several scenarios that have come up where having a user_event
persist even if the process that registered it exits. The main one is
having a daemon create events on bootup that shouldn't get deleted if
the daemon has to exit or reload. Another is within OpenTelemetry
exporters, they wish to potentially check if a user_event exists on the
system to determine if exporting the data out should occur. The
user_event in this case must exist even in the absence of the owning
process running (such as the above daemon case).

Since persistent events aren't automatically cleaned up, we want to ensure
only trusted users are allowed to do this. It seems reasonable to use
CAP_PERFMON as that boundary, since those users can already do many things
via perf_event_open without requiring full CAP_SYS_ADMIN.

This patchset brings back the ability to use /sys/kernel/tracing/dynamic_events
to create user_events, as persist is now back to being supported. Both the
register and delete of events that persist require CAP_PERFMON, which prevents
a non-perfmon user from making an event go away that a perfmon user decided
should persist.

Change History:

V2:
Added common user_event_capable() function to check access given
the register flags.

Ensure access check is done for dynamic_events when implicitly removing
events, such as "echo 'u:test' > /sys/kernel/tracing/dynamic_events".

Beau Belgrave (3):
  tracing/user_events: Allow events to persist for perfmon_capable users
  selftests/user_events: Test persist flag cases
  tracing/user_events: Document persist event flags

 Documentation/trace/user_events.rst   | 21 ++-
 include/uapi/linux/user_events.h  | 11 +++-
 kernel/trace/trace_events_user.c  | 36 +++-
 .../testing/selftests/user_events/abi_test.c  | 55 ++-
 .../testing/selftests/user_events/dyn_test.c  | 54 +-
 5 files changed, 157 insertions(+), 20 deletions(-)


base-commit: fc1653abba0d554aad80224e51bcad42b09895ed
-- 
2.34.1



[PATCH v2 3/3] tracing/user_events: Document persist event flags

2023-09-12 Thread Beau Belgrave
Users need to know how to make events persist now that we allow for
that. We also now allow the dynamic_events file to create events by
utilizing the persist flag during event register.

Add back in to documentation how /sys/kernel/tracing/dynamic_events can
be used to create persistent user_events. Add a section under registering
for the currently supported flags (USER_EVENT_REG_PERSIST) and the
required permissions. Add a note under deleting that deleting a
persistent event also requires sufficient permission.

Signed-off-by: Beau Belgrave 
---
 Documentation/trace/user_events.rst | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/Documentation/trace/user_events.rst 
b/Documentation/trace/user_events.rst
index e7b07313550a..576d2c35f22e 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -14,6 +14,11 @@ Programs can view status of the events via
 /sys/kernel/tracing/user_events_status and can both register and write
 data out via /sys/kernel/tracing/user_events_data.
 
+Programs can also use /sys/kernel/tracing/dynamic_events to register and
+delete user based events via the u: prefix. The format of the command to
+dynamic_events is the same as the ioctl with the u: prefix applied. This
+requires CAP_PERFMON due to the event persisting, otherwise -EPERM is returned.
+
 Typically programs will register a set of events that they wish to expose to
 tools that can read trace_events (such as ftrace and perf). The registration
 process tells the kernel which address and bit to reflect if any tool has
@@ -45,7 +50,7 @@ This command takes a packed struct user_reg as an argument::
 /* Input: Enable size in bytes at address */
 __u8 enable_size;
 
-/* Input: Flags for future use, set to 0 */
+/* Input: Flags to use, if any */
 __u16 flags;
 
 /* Input: Address to update when enabled */
@@ -69,7 +74,7 @@ The struct user_reg requires all the above inputs to be set 
appropriately.
   This must be 4 (32-bit) or 8 (64-bit). 64-bit values are only allowed to be
   used on 64-bit kernels, however, 32-bit can be used on all kernels.
 
-+ flags: The flags to use, if any. For the initial version this must be 0.
++ flags: The flags to use, if any.
   Callers should first attempt to use flags and retry without flags to ensure
   support for lower versions of the kernel. If a flag is not supported -EINVAL
   is returned.
@@ -80,6 +85,13 @@ The struct user_reg requires all the above inputs to be set 
appropriately.
 + name_args: The name and arguments to describe the event, see command format
   for details.
 
+The following flags are currently supported.
+
++ USER_EVENT_REG_PERSIST: The event will not delete upon the last reference
+  closing. Callers may use this if an event should exist even after the
+  process closes or unregisters the event. Requires CAP_PERFMON otherwise
+  -EPERM is returned.
+
 Upon successful registration the following is set.
 
 + write_index: The index to use for this file descriptor that represents this
@@ -141,7 +153,10 @@ event (in both user and kernel space). User programs 
should use a separate file
 to request deletes than the one used for registration due to this.
 
 **NOTE:** By default events will auto-delete when there are no references left
-to the event. Flags in the future may change this logic.
+to the event. If programs do not want auto-delete, they must use the
+USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used
+the event exists until DIAG_IOCSDEL is invoked. Both register and delete of an
+event that persists requires CAP_PERFMON, otherwise -EPERM is returned.
 
 Unregistering
 -
-- 
2.34.1



[PATCH v2 2/3] selftests/user_events: Test persist flag cases

2023-09-12 Thread Beau Belgrave
Now that we have exposed USER_EVENT_REG_PERSIST events can persist both
via the ABI and in the /sys/kernel/tracing/dynamic_events file.

Ensure both the ABI and DYN cases work by calling both during the parse
tests. Add new flags test that ensures only USER_EVENT_REG_PERSIST is
honored and any other flag is invalid.

Signed-off-by: Beau Belgrave 
---
 .../testing/selftests/user_events/abi_test.c  | 55 ++-
 .../testing/selftests/user_events/dyn_test.c  | 54 +-
 2 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/user_events/abi_test.c 
b/tools/testing/selftests/user_events/abi_test.c
index 5125c42efe65..b95fc15496ba 100644
--- a/tools/testing/selftests/user_events/abi_test.c
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -23,6 +23,18 @@
 const char *data_file = "/sys/kernel/tracing/user_events_data";
 const char *enable_file = 
"/sys/kernel/tracing/events/user_events/__abi_event/enable";
 
+static bool event_exists(void)
+{
+   int fd = open(enable_file, O_RDWR);
+
+   if (fd < 0)
+   return false;
+
+   close(fd);
+
+   return true;
+}
+
 static int change_event(bool enable)
 {
int fd = open(enable_file, O_RDWR);
@@ -46,7 +58,22 @@ static int change_event(bool enable)
return ret;
 }
 
-static int reg_enable(long *enable, int size, int bit)
+static int event_delete(void)
+{
+   int fd = open(data_file, O_RDWR);
+   int ret;
+
+   if (fd < 0)
+   return -1;
+
+   ret = ioctl(fd, DIAG_IOCSDEL, "__abi_event");
+
+   close(fd);
+
+   return ret;
+}
+
+static int reg_enable_flags(long *enable, int size, int bit, int flags)
 {
struct user_reg reg = {0};
int fd = open(data_file, O_RDWR);
@@ -57,6 +84,7 @@ static int reg_enable(long *enable, int size, int bit)
 
reg.size = sizeof(reg);
reg.name_args = (__u64)"__abi_event";
+   reg.flags = flags;
reg.enable_bit = bit;
reg.enable_addr = (__u64)enable;
reg.enable_size = size;
@@ -68,6 +96,11 @@ static int reg_enable(long *enable, int size, int bit)
return ret;
 }
 
+static int reg_enable(long *enable, int size, int bit)
+{
+   return reg_enable_flags(enable, size, bit, 0);
+}
+
 static int reg_disable(long *enable, int bit)
 {
struct user_unreg reg = {0};
@@ -121,6 +154,26 @@ TEST_F(user, enablement) {
ASSERT_EQ(0, change_event(false));
 }
 
+TEST_F(user, flags) {
+   /* USER_EVENT_REG_PERSIST is allowed */
+   ASSERT_EQ(0, reg_enable_flags(>check, sizeof(int), 0,
+ USER_EVENT_REG_PERSIST));
+   ASSERT_EQ(0, reg_disable(>check, 0));
+
+   /* Ensure it exists after close and disable */
+   ASSERT_TRUE(event_exists());
+
+   /* Ensure we can delete it */
+   ASSERT_EQ(0, event_delete());
+
+   /* USER_EVENT_REG_MAX or above is not allowed */
+   ASSERT_EQ(-1, reg_enable_flags(>check, sizeof(int), 0,
+  USER_EVENT_REG_MAX));
+
+   /* Ensure it does not exist after invalid flags */
+   ASSERT_FALSE(event_exists());
+}
+
 TEST_F(user, bit_sizes) {
/* Allow 0-31 bits for 32-bit */
ASSERT_EQ(0, reg_enable(>check, sizeof(int), 0));
diff --git a/tools/testing/selftests/user_events/dyn_test.c 
b/tools/testing/selftests/user_events/dyn_test.c
index 91aad42b..f2a41bcb5ad8 100644
--- a/tools/testing/selftests/user_events/dyn_test.c
+++ b/tools/testing/selftests/user_events/dyn_test.c
@@ -16,9 +16,25 @@
 
 #include "../kselftest_harness.h"
 
+const char *dyn_file = "/sys/kernel/tracing/dynamic_events";
 const char *abi_file = "/sys/kernel/tracing/user_events_data";
 const char *enable_file = 
"/sys/kernel/tracing/events/user_events/__test_event/enable";
 
+static int event_delete(void)
+{
+   int fd = open(abi_file, O_RDWR);
+   int ret;
+
+   if (fd < 0)
+   return -1;
+
+   ret = ioctl(fd, DIAG_IOCSDEL, "__test_event");
+
+   close(fd);
+
+   return ret;
+}
+
 static bool wait_for_delete(void)
 {
int i;
@@ -63,7 +79,31 @@ static int unreg_event(int fd, int *check, int bit)
return ioctl(fd, DIAG_IOCSUNREG, );
 }
 
-static int parse(int *check, const char *value)
+static int parse_dyn(const char *value)
+{
+   int fd = open(dyn_file, O_RDWR | O_APPEND);
+   int len = strlen(value);
+   int ret;
+
+   if (fd == -1)
+   return -1;
+
+   ret = write(fd, value, len);
+
+   if (ret == len)
+   ret = 0;
+   else
+   ret = -1;
+
+   close(fd);
+
+   if (ret == 0)
+   event_delete();
+
+   return ret;
+}
+
+static int parse_abi(int *check, const char *value)
 {
int fd = open(abi_file, O_RDWR);
int ret;
@@ -89,6 +129,18 @@ static int parse(int *che