When samples are generated, there is no way via the perf_event ABI to
fetch per-thread data. This data is very useful in tracing scenarios
that involve correlation IDs, such as OpenTelemetry. It is also
useful for tracking per-thread performance details directly within a
cooperating user process.

The newly established OpenTelemetry profiling group requires a way to
get tracing correlations on both Linux and Windows. On Windows this
correlation is on a per-thread basis directly via ETW. On Linux we need
a fast mechanism to store these details, and TLS seems like the best
option; see the links below for more details.

Add a new sample type (PERF_SAMPLE_TLS_USER) that fetches TLS data up to
X bytes per-sample. Use the existing PERF_SAMPLE_STACK_USER ABI layout
for outputting the data to consumers. Store the data size requested by
the user in the previously reserved u16 (__reserved_2) within
perf_event_attr.

Add tls_addr and tls_user_size to perf_sample_data and calculate them
during sample preparation. This allows the output side to know if
truncation is going to occur and avoids having to re-fetch the TLS value
from the user process a second time.

Add CONFIG_HAVE_PERF_USER_TLS_DUMP so that architectures can specify if
they have a TLS-specific register (or other logic) that can be used for
dumping. This does not yet enable any architecture to do TLS dumps; it
simply makes it possible by allowing an arch-defined method named
arch_perf_user_tls_pointer().

Add perf_tls struct that arch_perf_user_tls_pointer() utilizes to set
TLS details of the address and size (for 32bit on 64bit compat cases).

Link: https://opentelemetry.io/blog/2024/profiling/
Link: https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation
Signed-off-by: Beau Belgrave <be...@linux.microsoft.com>
---
 arch/Kconfig                    |   7 +++
 include/linux/perf_event.h      |   7 +++
 include/uapi/linux/perf_event.h |   5 +-
 kernel/events/core.c            | 105 +++++++++++++++++++++++++++++++-
 kernel/events/internal.h        |  16 +++++
 5 files changed, 137 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71..6afaf5f46e2f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,13 @@ config HAVE_PERF_USER_STACK_DUMP
          access to the user stack pointer which is not unified across
          architectures.
 
+config HAVE_PERF_USER_TLS_DUMP
+       bool
+       help
+         Support user TLS dumps for perf event samples. This needs
+         access to the user TLS pointer which is not unified across
+         architectures.
+
 config HAVE_ARCH_JUMP_LABEL
        bool
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a..7fac81929eed 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1202,8 +1202,15 @@ struct perf_sample_data {
        u64                             data_page_size;
        u64                             code_page_size;
        u64                             aux_size;
+       u64                             tls_addr;
+       u64                             tls_user_size;
 } ____cacheline_aligned;
 
+struct perf_tls {
+       unsigned long base; /* User address the TLS pointer value is read from */
+       unsigned long size; /* Bytes to read at base; at most sizeof(long) */
+};
+
 /* default value for data source */
 #define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
                    PERF_MEM_S(LVL, NA)   |\
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3a64499b0f5d..b62669cfe581 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -162,8 +162,9 @@ enum perf_event_sample_format {
        PERF_SAMPLE_DATA_PAGE_SIZE              = 1U << 22,
        PERF_SAMPLE_CODE_PAGE_SIZE              = 1U << 23,
        PERF_SAMPLE_WEIGHT_STRUCT               = 1U << 24,
+       PERF_SAMPLE_TLS_USER                    = 1U << 25,
 
-       PERF_SAMPLE_MAX = 1U << 25,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 26,             /* non-ABI */
 };
 
 #define PERF_SAMPLE_WEIGHT_TYPE        (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT)
@@ -509,7 +510,7 @@ struct perf_event_attr {
         */
        __u32   aux_watermark;
        __u16   sample_max_stack;
-       __u16   __reserved_2;
+       __u16   sample_tls_user; /* Size of TLS data to dump on samples */
        __u32   aux_sample_size;
        __u32   __reserved_3;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 07de5cc2aa25..f848bf4be9bd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6926,6 +6926,45 @@ static u64 perf_ustack_task_size(struct pt_regs *regs)
        return TASK_SIZE - addr;
 }
 
+/*
+ * Resolve the user TLS dump window: read the TLS pointer value from
+ * tls.base, step back dump_size bytes and store that start in *tls_addr
+ * (0 on failure) so prepare and output need not look it up twice.
+ * Returns TASK_SIZE minus the start address, or 0 if nothing to dump.
+ */
+static u64
+perf_utls_task_size(struct pt_regs *regs, u64 dump_size, u64 *tls_addr)
+{
+       struct perf_tls tls;
+       unsigned long addr;
+
+       *tls_addr = 0;
+
+       /* No regs, no tls pointer, no dump. */
+       if (!regs)
+               return 0;
+
+       perf_user_tls_pointer(&tls);
+
+       if (WARN_ONCE(tls.size > sizeof(addr), "perf: Bad TLS size.\n"))
+               return 0;
+
+       addr = 0; /* tls.size may be smaller than sizeof(addr) */
+       arch_perf_out_copy_user(&addr, (void *)tls.base, tls.size);
+
+       if (addr < dump_size) /* the subtraction below would wrap */
+               return 0;
+
+       addr -= dump_size; /* dump window ends at the TLS pointer value */
+
+       if (!addr || addr >= TASK_SIZE)
+               return 0;
+
+       *tls_addr = addr;
+
+       return TASK_SIZE - addr;
+}
+
 static u16
 perf_sample_dump_size(u16 dump_size, u16 header_size, u64 task_size)
 {
@@ -6997,6 +7036,43 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
        }
 }
 
+static void
+perf_output_sample_utls(struct perf_output_handle *handle, u64 addr,
+                       u64 dump_size, struct pt_regs *regs)
+{
+       /* Emit the TLS dump; without user regs only a zero size is written */
+       if (!regs) {
+               u64 size = 0;
+               perf_output_put(handle, size);
+       } else {
+               unsigned int rem;
+               u64 dyn_size;
+
+               /*
+                * Record layout written by this branch:
+                *   u64 static size  - dump_size as passed in by the caller
+                *   data             - dump_size bytes copied from user
+                *                      memory at addr; the uncopied remainder
+                *                      is passed to perf_output_skip()
+                *   u64 dynamic size - dump_size minus that remainder
+                * NOTE(review): with regs set and dump_size == 0 both u64
+                * sizes are still written; confirm prepare accounts for it.
+                */
+
+               /* Static size. */
+               perf_output_put(handle, dump_size);
+
+               /* Data; rem is presumably the byte count left uncopied. */
+               rem = __output_copy_user(handle, (void *)addr, dump_size);
+               dyn_size = dump_size - rem;
+
+               perf_output_skip(handle, rem);
+
+               /* Dynamic size. */
+               perf_output_put(handle, dyn_size);
+       }
+}
+
 static unsigned long perf_prepare_sample_aux(struct perf_event *event,
                                          struct perf_sample_data *data,
                                          size_t size)
@@ -7474,6 +7550,13 @@ void perf_output_sample(struct perf_output_handle *handle,
        if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
                perf_output_put(handle, data->code_page_size);
 
+       if (sample_type & PERF_SAMPLE_TLS_USER) {
+               perf_output_sample_utls(handle,
+                                       data->tls_addr,
+                                       data->tls_user_size,
+                                       data->regs_user.regs);
+       }
+
        if (sample_type & PERF_SAMPLE_AUX) {
                perf_output_put(handle, data->aux_size);
 
@@ -7759,6 +7842,19 @@ void perf_prepare_sample(struct perf_sample_data *data,
                data->sample_flags |= PERF_SAMPLE_STACK_USER;
        }
 
+       if (filtered_sample_type & PERF_SAMPLE_TLS_USER) {
+               u16 tls_size = event->attr.sample_tls_user;
+               u64 task_size = perf_utls_task_size(data->regs_user.regs,
+                                                   tls_size,
+                                                   &data->tls_addr);
+
+               tls_size = perf_prepare_dump_data(data, event, regs,
+                                                 tls_size, task_size);
+
+               data->tls_user_size = tls_size;
+               data->sample_flags |= PERF_SAMPLE_TLS_USER;
+       }
+
        if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
                data->weight.full = 0;
                data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
@@ -12159,7 +12255,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 
        attr->size = size;
 
-       if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+       if (attr->__reserved_1 || attr->__reserved_3)
                return -EINVAL;
 
        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -12225,6 +12321,13 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
                        return -EINVAL;
        }
 
+       if (attr->sample_type & PERF_SAMPLE_TLS_USER) {
+               if (!arch_perf_have_user_tls_dump())
+                       return -ENOSYS;
+               else if (!IS_ALIGNED(attr->sample_tls_user, sizeof(u64)))
+                       return -EINVAL;
+       }
+
        if (!attr->sample_max_stack)
                attr->sample_max_stack = sysctl_perf_event_max_stack;
 
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5150d5f84c03..b42747b1eb04 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -243,4 +243,20 @@ static inline bool arch_perf_have_user_stack_dump(void)
 #define perf_user_stack_pointer(regs) 0
 #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
 
+#ifdef CONFIG_HAVE_PERF_USER_TLS_DUMP
+static inline bool arch_perf_have_user_tls_dump(void)
+{
+       return true;
+}
+
+#define perf_user_tls_pointer(tls) arch_perf_user_tls_pointer(tls)
+#else /* !CONFIG_HAVE_PERF_USER_TLS_DUMP */
+static inline bool arch_perf_have_user_tls_dump(void)
+{
+       return false;
+}
+
+#define perf_user_tls_pointer(tls) memset(tls, 0, sizeof(*tls))
+#endif /* CONFIG_HAVE_PERF_USER_TLS_DUMP */
+
 #endif /* _KERNEL_EVENTS_INTERNAL_H */
-- 
2.34.1


Reply via email to