Re: [PATCH v1 0/4] perf parse-regs: Cleanup config and building

2024-02-14 Thread Leo Yan
On Wed, Feb 14, 2024 at 02:42:55PM -0800, Ian Rogers wrote:

[...]

> Thanks Leo, this is great cleanup! Series:
> Reviewed-by: Ian Rogers 

Thanks a lot for reviewing, Ian!

Leo



[PATCH v1 4/4] perf build: Cleanup perf register configuration

2024-02-14 Thread Leo Yan
The target is to allow the tool to always enable the perf register
feature for native parsing and cross parsing, and current code doesn't
depend on the macro 'HAVE_PERF_REGS_SUPPORT'.

This patch remove the variable 'NO_PERF_REGS' and the defined macro
'HAVE_PERF_REGS_SUPPORT' from the Makefile.

Signed-off-by: Leo Yan 
---
 tools/perf/Makefile.config | 21 -
 1 file changed, 21 deletions(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 8b740c668ab7..7de7111c0226 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -28,8 +28,6 @@ include $(srctree)/tools/scripts/Makefile.arch
 
 $(call detected_var,SRCARCH)
 
-NO_PERF_REGS := 1
-
 ifneq ($(NO_SYSCALL_TABLE),1)
   NO_SYSCALL_TABLE := 1
 
@@ -50,7 +48,6 @@ endif
 
 # Additional ARCH settings for ppc
 ifeq ($(SRCARCH),powerpc)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/powerpc/include/generated
   LIBUNWIND_LIBS := -lunwind -lunwind-ppc64
 endif
@@ -66,41 +63,27 @@ ifeq ($(SRCARCH),x86)
   else
 LIBUNWIND_LIBS = -lunwind-x86 -llzma -lunwind
   endif
-  NO_PERF_REGS := 0
 endif
 
 ifeq ($(SRCARCH),arm)
-  NO_PERF_REGS := 0
   LIBUNWIND_LIBS = -lunwind -lunwind-arm
 endif
 
 ifeq ($(SRCARCH),arm64)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/arm64/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
 endif
 
 ifeq ($(SRCARCH),loongarch)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/loongarch/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-loongarch64
 endif
 
-ifeq ($(SRCARCH),riscv)
-  NO_PERF_REGS := 0
-endif
-
-ifeq ($(SRCARCH),csky)
-  NO_PERF_REGS := 0
-endif
-
 ifeq ($(ARCH),s390)
-  NO_PERF_REGS := 0
   CFLAGS += -fPIC -I$(OUTPUT)arch/s390/include/generated
 endif
 
 ifeq ($(ARCH),mips)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/mips/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-mips
 endif
@@ -161,10 +144,6 @@ endif
 FEATURE_CHECK_CFLAGS-libopencsd := $(LIBOPENCSD_CFLAGS)
 FEATURE_CHECK_LDFLAGS-libopencsd := $(LIBOPENCSD_LDFLAGS) $(OPENCSDLIBS)
 
-ifeq ($(NO_PERF_REGS),0)
-  CFLAGS += -DHAVE_PERF_REGS_SUPPORT
-endif
-
 # for linking with debug library, run like:
 # make DEBUG=1 LIBDW_DIR=/opt/libdw/
 ifdef LIBDW_DIR
-- 
2.34.1




[PATCH v1 3/4] perf parse-regs: Introduce a weak function arch__sample_reg_masks()

2024-02-14 Thread Leo Yan
Every architecture can provide a register list for sampling. If an
architecture doesn't support register sampling, it won't define the data
structure 'sample_reg_masks'. Consequently, any code using this
structure must be protected by the macro 'HAVE_PERF_REGS_SUPPORT'.

This patch defines a weak function, arch__sample_reg_masks(), which will
be replaced by an architecture-defined function for returning the
architecture's register list. With this refactoring, the function always
exists, the condition checking for 'HAVE_PERF_REGS_SUPPORT' is not
needed anymore, so remove it.

Signed-off-by: Leo Yan 
---
 tools/perf/arch/arm/util/perf_regs.c   | 7 ++-
 tools/perf/arch/arm64/util/machine.c   | 2 ++
 tools/perf/arch/arm64/util/perf_regs.c | 7 ++-
 tools/perf/arch/csky/util/perf_regs.c  | 7 ++-
 tools/perf/arch/loongarch/util/perf_regs.c | 7 ++-
 tools/perf/arch/mips/util/perf_regs.c  | 7 ++-
 tools/perf/arch/powerpc/util/perf_regs.c   | 7 ++-
 tools/perf/arch/riscv/util/perf_regs.c | 7 ++-
 tools/perf/arch/s390/util/perf_regs.c  | 7 ++-
 tools/perf/arch/x86/util/perf_regs.c   | 7 ++-
 tools/perf/util/parse-regs-options.c   | 8 ++--
 tools/perf/util/perf_regs.c| 9 +
 tools/perf/util/perf_regs.h| 3 +--
 13 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/tools/perf/arch/arm/util/perf_regs.c 
b/tools/perf/arch/arm/util/perf_regs.c
index 2c56e8b56ddf..f94a0210c7b7 100644
--- a/tools/perf/arch/arm/util/perf_regs.c
+++ b/tools/perf/arch/arm/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+   return sample_reg_masks;
+}
diff --git a/tools/perf/arch/arm64/util/machine.c 
b/tools/perf/arch/arm64/util/machine.c
index ba1144366e85..aab1cc2bc283 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -12,5 +12,7 @@
 
 void arch__add_leaf_frame_record_opts(struct record_opts *opts)
 {
+   const struct sample_reg *sample_reg_masks = arch__sample_reg_masks();
+
opts->sample_user_regs |= sample_reg_masks[PERF_REG_ARM64_LR].mask;
 }
diff --git a/tools/perf/arch/arm64/util/perf_regs.c 
b/tools/perf/arch/arm64/util/perf_regs.c
index 1b79d8eab22f..09308665e28a 100644
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ b/tools/perf/arch/arm64/util/perf_regs.c
@@ -16,7 +16,7 @@
 #define HWCAP_SVE  (1 << 22)
 #endif
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
SMPL_REG(x0, PERF_REG_ARM64_X0),
SMPL_REG(x1, PERF_REG_ARM64_X1),
SMPL_REG(x2, PERF_REG_ARM64_X2),
@@ -175,3 +175,8 @@ uint64_t arch__user_reg_mask(void)
}
return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+   return sample_reg_masks;
+}
diff --git a/tools/perf/arch/csky/util/perf_regs.c 
b/tools/perf/arch/csky/util/perf_regs.c
index c0877c264d49..6b1665f41180 100644
--- a/tools/perf/arch/csky/util/perf_regs.c
+++ b/tools/perf/arch/csky/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+   return sample_reg_masks;
+}
diff --git a/tools/perf/arch/loongarch/util/perf_regs.c 
b/tools/perf/arch/loongarch/util/perf_regs.c
index 2c56e8b56ddf..f94a0210c7b7 100644
--- a/tools/perf/arch/loongarch/util/perf_regs.c
+++ b/tools/perf/arch/loongarch/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+   return sample_reg_masks;
+}
diff --git a/tools/perf/arch/mips/util/perf_regs.c 
b/tools/perf/arch/mips/util/perf_regs.c
index c0877c264d49..6b1665f41180 100644
--- a/tools/perf/arch/mips/util/perf_regs.c
+++ b/tools/perf/arch/mips/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {

[PATCH v1 2/4] perf parse-regs: Always build perf register functions

2024-02-14 Thread Leo Yan
Currently, the macro HAVE_PERF_REGS_SUPPORT is used as a switch to turn
on or turn off the code of perf registers. If any architecture cannot
support perf register, it disables the perf register parsing, for both
the native parsing and cross parsing for other architectures.

To support both the native parsing and cross parsing, the tool should
always build the perf regs functions. Thus, this patch removes
HAVE_PERF_REGS_SUPPORT from the perf regs files.

Signed-off-by: Leo Yan 
---
 .../util/perf-regs-arch/perf_regs_aarch64.c   |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_arm.c  |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_csky.c |  4 ---
 .../util/perf-regs-arch/perf_regs_loongarch.c |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_mips.c |  4 ---
 .../util/perf-regs-arch/perf_regs_powerpc.c   |  4 ---
 .../util/perf-regs-arch/perf_regs_riscv.c |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_s390.c |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_x86.c  |  4 ---
 tools/perf/util/perf_regs.c   |  4 ---
 tools/perf/util/perf_regs.h   | 31 ---
 11 files changed, 71 deletions(-)

diff --git a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c 
b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
index 696566c54768..9dcda80d310f 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/arm64/include/uapi/asm/perf_regs.h"
 
@@ -92,5 +90,3 @@ uint64_t __perf_reg_sp_arm64(void)
 {
return PERF_REG_ARM64_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_arm.c 
b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
index 700fd07cd2aa..e29d130a587a 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_arm.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/arm/include/uapi/asm/perf_regs.h"
 
@@ -56,5 +54,3 @@ uint64_t __perf_reg_sp_arm(void)
 {
return PERF_REG_ARM_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_csky.c 
b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
index a2841094e096..75b461ef2eba 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_csky.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../arch/csky/include/uapi/asm/perf_regs.h"
 
@@ -96,5 +94,3 @@ uint64_t __perf_reg_sp_csky(void)
 {
return PERF_REG_CSKY_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c 
b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
index a9ba0f934123..043f97f4e3ac 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/loongarch/include/uapi/asm/perf_regs.h"
 
@@ -87,5 +85,3 @@ uint64_t __perf_reg_sp_loongarch(void)
 {
return PERF_REG_LOONGARCH_R3;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_mips.c 
b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
index 5a45830cfbf5..793178fc3c78 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_mips.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/mips/include/uapi/asm/perf_regs.h"
 
@@ -83,5 +81,3 @@ uint64_t __perf_reg_sp_mips(void)
 {
return PERF_REG_MIPS_R29;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c 
b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
index 1f0d682db74a..08636bb09a3a 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/powerpc/include/uapi/asm/perf_regs.h"
 
@@ -141,5 +139,3 @@ uint64_t __perf_reg_sp_powerpc(void)
 {
return PERF_REG_POWERPC_R1;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c 
b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
index e432630be4c5..337b687c655d 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/riscv/incl

[PATCH v1 1/4] perf build: Remove unused CONFIG_PERF_REGS

2024-02-14 Thread Leo Yan
CONFIG_PERF_REGS is not used, remove it.

Signed-off-by: Leo Yan 
---
 tools/perf/Makefile.config | 4 
 1 file changed, 4 deletions(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index aa55850fbc21..8b740c668ab7 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -105,10 +105,6 @@ ifeq ($(ARCH),mips)
   LIBUNWIND_LIBS = -lunwind -lunwind-mips
 endif
 
-ifeq ($(NO_PERF_REGS),0)
-  $(call detected,CONFIG_PERF_REGS)
-endif
-
 # So far there's only x86 and arm libdw unwind support merged in perf.
 # Disable it on all other architectures in case libdw unwind
 # support is detected in system. Add supported architectures
-- 
2.34.1




[PATCH v1 0/4] perf parse-regs: Cleanup config and building

2024-02-14 Thread Leo Yan
Currently, the perf building enables register parsing based on the
target architecture has supported register feature.

Furthermore, the perf building system needs to maintain a variable
'NO_PERF_REGS' and defines macro 'HAVE_PERF_REGS_SUPPORT' for statically
compiling the tool.

As a result, the perf has no flexibilty for parsing register if an
architecture doesn't support it. And the source files use the macro
'HAVE_PERF_REGS_SUPPORT' to switch on and off the register parsing
related code, which is not a good practice.

This series is to remove the static building for register parsing. In
theory, we should can dynamically detect if an arch has support this
feature and functions can return errors when the feature is not
supported.

The first patch is to remove unused build configuration
CONFIG_PERF_REGS.

The second patch is to build perf register functions, without using the
macro 'HAVE_PERF_REGS_SUPPORT' to statically turn on or off code.

The third patch is to introduce a weak function arch__sample_reg_masks(),
this function can allow the target arch to return its sample register
list.  With this change, we can totally remove the macro
'HAVE_PERF_REGS_SUPPORT' in the source file.

The forth patch is to clean up the Makefile for removing relevant
configuration and macro definition, as they are not useful anymore.

I tested this patch set on Arm64 and x86 for building and did a cross
register parsing ('perf record' on Arm64 and 'perf report' on x86).


Leo Yan (4):
  perf build: Remove unused CONFIG_PERF_REGS
  perf parse-regs: Always build perf register functions
  perf parse-regs: Introduce a weak function arch__sample_reg_masks()
  perf build: Cleanup perf register configuration

 tools/perf/Makefile.config| 25 --
 tools/perf/arch/arm/util/perf_regs.c  |  7 +++-
 tools/perf/arch/arm64/util/machine.c  |  2 ++
 tools/perf/arch/arm64/util/perf_regs.c|  7 +++-
 tools/perf/arch/csky/util/perf_regs.c |  7 +++-
 tools/perf/arch/loongarch/util/perf_regs.c|  7 +++-
 tools/perf/arch/mips/util/perf_regs.c |  7 +++-
 tools/perf/arch/powerpc/util/perf_regs.c  |  7 +++-
 tools/perf/arch/riscv/util/perf_regs.c|  7 +++-
 tools/perf/arch/s390/util/perf_regs.c |  7 +++-
 tools/perf/arch/x86/util/perf_regs.c  |  7 +++-
 tools/perf/util/parse-regs-options.c  |  8 ++---
 .../util/perf-regs-arch/perf_regs_aarch64.c   |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_arm.c  |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_csky.c |  4 ---
 .../util/perf-regs-arch/perf_regs_loongarch.c |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_mips.c |  4 ---
 .../util/perf-regs-arch/perf_regs_powerpc.c   |  4 ---
 .../util/perf-regs-arch/perf_regs_riscv.c |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_s390.c |  4 ---
 .../perf/util/perf-regs-arch/perf_regs_x86.c  |  4 ---
 tools/perf/util/perf_regs.c   | 11 --
 tools/perf/util/perf_regs.h   | 34 +--
 23 files changed, 67 insertions(+), 112 deletions(-)

-- 
2.34.1




[PATCH] perf auxtrace: Fix potential null pointer dereference

2021-04-20 Thread Leo Yan
In the function auxtrace_parse_snapshot_options(), the callback pointer
"itr->parse_snapshot_options" can be NULL if it has not been set during
the AUX record initialization.  This can cause tool crashing if the
callback pointer "itr->parse_snapshot_options" is dereferenced without
performing NULL check.

Add a NULL check for the pointer "itr->parse_snapshot_options" before
invoke the callback.

Fixes: d20031bb63dd ("perf tools: Add AUX area tracing Snapshot Mode")
Signed-off-by: Leo Yan 
---
 tools/perf/util/auxtrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 953f4afacd3b..320b47f133d3 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -638,7 +638,7 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record 
*itr,
break;
}
 
-   if (itr)
+   if (itr && itr->parse_snapshot_options)
return itr->parse_snapshot_options(itr, opts, str);
 
pr_err("No AUX area tracing to snapshot\n");
-- 
2.25.1



Re: [PATCH v4 4/6] perf arm-spe: Assign kernel time to synthesized event

2021-04-16 Thread Leo Yan
On Fri, Apr 16, 2021 at 03:51:25PM +0300, James Clark wrote:

[...]

> >> I noticed that in arm_spe_recording_options() the TIME sample bit is set 
> >> regardless of any options.
> >> I don't know of a way to remove this, and if there isn't, does that mean 
> >> that all the code in this
> >> file that looks at spe->timeless_decoding is untested and has never been 
> >> hit?
> >>
> >> Unless there is a way to get a perf file with only the AUXTRACE event and 
> >> no others? I think that one
> >> might have no timestamp set. Otherwise other events will always have 
> >> timestamps so spe->timeless_decoding
> >> is always false.
> > 
> > Good point.  To be honest, I never noticed this issue until you
> > mentioned this.
> > 
> > We should fix for the "timeless" flow; and it's questionable for the
> > function arm_spe_recording_options(), except for setting
> > PERF_SAMPLE_TIME, it also hard codes for setting
> > PERF_SAMPLE_CPU and PERF_SAMPLE_TID.  Might need to carefully go
> > through this function.
> > 
> 
> Yeah, it's not strictly related to your change, which is definitely an 
> improvement.
> But maybe we should have a look at the SPE implementation relating to 
> timestamps as a whole.

Totally agree, at least this patch series should not introduce any
barrier for timeless case.  I will go back to verify it; if you'd
like to fix timeless issue, please feel free to go ahead.

Thanks,
Leo


Re: [PATCH v4 4/6] perf arm-spe: Assign kernel time to synthesized event

2021-04-15 Thread Leo Yan
On Thu, Apr 15, 2021 at 05:46:31PM +0300, James Clark wrote:
> 
> 
> On 12/04/2021 12:10, Leo Yan wrote:
> > In current code, it assigns the arch timer counter to the synthesized
> > samples Arm SPE trace, thus the samples don't contain the kernel time
> > but only contain the raw counter value.
> > 
> > To fix the issue, this patch converts the timer counter to kernel time
> > and assigns it to sample timestamp.
> > 
> > Signed-off-by: Leo Yan 
> > ---
> >  tools/perf/util/arm-spe.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> > index 23714cf0380e..c13a89f06ab8 100644
> > --- a/tools/perf/util/arm-spe.c
> > +++ b/tools/perf/util/arm-spe.c
> > @@ -234,7 +234,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
> > struct arm_spe_record *record = >decoder->record;
> >  
> > if (!spe->timeless_decoding)
> > -   sample->time = speq->timestamp;
> > +   sample->time = tsc_to_perf_time(record->timestamp, >tc);
> 
> 
> I noticed that in arm_spe_recording_options() the TIME sample bit is set 
> regardless of any options.
> I don't know of a way to remove this, and if there isn't, does that mean that 
> all the code in this
> file that looks at spe->timeless_decoding is untested and has never been hit?
> 
> Unless there is a way to get a perf file with only the AUXTRACE event and no 
> others? I think that one
> might have no timestamp set. Otherwise other events will always have 
> timestamps so spe->timeless_decoding
> is always false.

Good point.  To be honest, I never noticed this issue until you
mentioned this.

We should fix for the "timeless" flow; and it's questionable for the
function arm_spe_recording_options(), except for setting
PERF_SAMPLE_TIME, it also hard codes for setting
PERF_SAMPLE_CPU and PERF_SAMPLE_TID.  Might need to carefully go
through this function.

Thanks,
Leo


Re: [PATCH v4 0/6] perf arm-spe: Enable timestamp

2021-04-15 Thread Leo Yan
On Thu, Apr 15, 2021 at 05:43:24PM +0300, James Clark wrote:
> Hi Leo,
> 
> I was looking at testing this on N1SDP and I thought I would try the round 
> trip with perf inject and
> then perf report but saw that perf inject with SPE always results in an error 
> (unrelated to your change)
> 
>-> ./perf report -i per-thread-spe-time.inject.data
>   0x1328 [0x8]: failed to process type: 9 [Bad address]
>   Error:
>   failed to process sample
> 
> 
> Do you have any test suggestions other than looking at the raw data?

Good catching!  I didn't use inject mode for Arm SPE before (it's not
not like Arm CoreSight for instruction sample, or SPE's branch sample
is statistical so we cannot generate branch samples based on accurate
interval).

For the debugging, it's good to use "git grep" to search "Bad address"
to check where the error happens, and can use gdb.  I personally think
it's possible to go back to check the sythenization flow, simply to
say, it might have problems when inject samples but not in the
decoding flow.

Thanks,
Leo


Re: [PATCH v4 1/6] perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS

2021-04-15 Thread Leo Yan
Hi James,

On Thu, Apr 15, 2021 at 05:13:36PM +0300, James Clark wrote:
> On 12/04/2021 12:10, Leo Yan wrote:
> > The enum value 'ARM_SPE_PER_CPU_MMAPS' is never used so remove it.
> 
> Hi Leo,
> 
> I think this causes an error when attempting to open a newly recorded file
> with an old version of perf. The value ARM_SPE_AUXTRACE_PRIV_MAX is used here:
> 
>   size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
>   struct perf_record_time_conv *tc = >time_conv;
>   struct arm_spe *spe;
>   int err;
> 
>   if (auxtrace_info->header.size < sizeof(struct 
> perf_record_auxtrace_info) +
>   min_sz)
>   return -EINVAL;
> 
> And removing ARM_SPE_PER_CPU_MMAPS changes the value of 
> ARM_SPE_AUXTRACE_PRIV_MAX.
> 
> At least I think that's what's causing the problem. I get this error:
> 
>   ./perf report -i per-thread-spe-time.data
>   0x1c0 [0x18]: failed to process type: 70 [Invalid argument]
>   Error:
>   failed to process sample
>   # To display the perf.data header info, please use 
> --header/--header-only options.
>   #

Yes, when working on this patch I had concern as well.

I carefully thought that the perf tool should be backwards-compatible,
but there have no requirement for forwards-compatibility.  This is the
main reason why I kept this patch.

If you or anyone could confirm the forwards-compatibility is required,
it's quite fine for me to drop this patch.

Thanks a lot for the reviewing and testing!
Leo


Re: [PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-15 Thread Leo Yan
Hi James,

On Thu, Apr 15, 2021 at 03:51:46PM +0300, James Clark wrote:

[...]

> > For the orignal perf data file with "--per-thread" option, the decoder
> > runs into the condition for "etm->timeless_decoding"; and it doesn't
> > contain ETM timestamp.
> > 
> > Afterwards, the injected perf data file also misses ETM timestamp and
> > hit the condition "etm->timeless_decoding".
> > 
> > So I am confusing why the original perf data can be processed properly
> > but fails to handle the injected perf data file.
> 
> Hi Leo,
> 
> My patch only deals with per-cpu mode. With per-thread mode everything is 
> already working
> because _none_ of the events have timestamps because they are not enabled by 
> default:
> 
>   /* In per-cpu case, always need the time of mmap events etc */
>   if (!perf_cpu_map__empty(cpus))
>   evsel__set_sample_bit(tracking_evsel, TIME);
> 
> When none of the events have timestamps, I think perf doesn't use the 
> ordering code in
> ordered-events.c. So when the inject file is opened, the events are read in 
> file order.

The explination makes sense to me.  One thinking: if the original file
doesn't use the ordered event, is it possible for the injected file to
not use the ordered event as well?

Could you confirm Intel-pt can work well for per-cpu mode for inject
file?

> So it's not really about --per-thread vs per-cpu mode, it's actually about 
> whether
> PERF_SAMPLE_TIME is set, which is set as a by-product of per-cpu mode.
>
> I hope I understood your question properly.

Thanks for info, sorry if I miss any info you have elaborated.

Leo


Re: [PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-15 Thread Leo Yan
On Wed, Apr 14, 2021 at 05:41:46PM +0300, James Clark wrote:
> Hi,
> 
> For this change, I also tried removing the setting of PERF_SAMPLE_TIME in 
> cs_etm__synth_events(). In theory, this would remove the sorting when opening 
> the file, but the change doesn't affect when the built-in events are saved to 
> the inject file. Resulting in events like MMAP and COMM with timestamps, but 
> the synthesised events without. This results in the same issue of the 
> synthesised events appearing before the COMM and MMAP events. If it was 
> possible to somehow tell perf to remove timestamps from built-in events, 
> removing PERF_SAMPLE_TIME would probably be the right solution, because we 
> don't set sample.time.
> 
> For Arm v8.4 we will have the kernel time in the etm timestamps, so an if can 
> be added to switch between this behaviour and the next (more correct) one 
> depending on the hardware. 
> 
> On the subject of timestamps, but not related to this change, some 
> combinations of timestamp options aren't working. For example:
> 
> perf record -e cs_etm/time,@tmc_etr0/u --per-thread
> or  perf record -e cs_etm/@tmc_etr0/u --timestamp --per-thread
> 
> These don't work because of the assumption that etm->timeless_decoding == 
> --per-thread
> and kernel timestamps enabled (/time/ or --timestamp) == etm timestamps 
> enabled (/timestamp/), which isn't necessarily true.
> 
> This can be made to work with a few code changes for cs_etm/time,timestamp/u 
> --per-thread, but cs_etm/time/u --per-thread could be a bit more work. 
> Changes involved would be using "per_cpu_mmaps" in some places instead of 
> etm->timeless_decoding, and also setting etm->timeless_decoding based on 
> whether there are any etm timestamps, not kernel ones. Although to search for 
> any etm timestamp would involve a full decode ahead of time which might not 
> be feasible (or maybe just checking the options, although that's not how it's 
> done in cs_etm__is_timeless_decoding() currently).

Confirm for one thing:

For the orignal perf data file with "--per-thread" option, the decoder
runs into the condition for "etm->timeless_decoding"; and it doesn't
contain ETM timestamp.

Afterwards, the injected perf data file also misses ETM timestamp and
hit the condition "etm->timeless_decoding".

So I am confusing why the original perf data can be processed properly
but fails to handle the injected perf data file.

Thanks,
Leo

> Or, we could force /time/ and /timestamp/ options to always be enabled 
> together in the record stage. 
> 
> 
> Thanks
> James
> 
> On 14/04/2021 17:39, James Clark wrote:
> > The following attribute is set when synthesising samples in
> > timed decoding mode:
> > 
> > attr.sample_type |= PERF_SAMPLE_TIME;
> > 
> > This results in new samples that appear to have timestamps but
> > because we don't assign any timestamps to the samples, when the
> > resulting inject file is opened again, the synthesised samples
> > will be on the wrong side of the MMAP or COMM events.
> > 
> > For example this results in the samples being associated with
> > the perf binary, rather than the target of the record:
> > 
> > perf record -e cs_etm/@tmc_etr0/u top
> > perf inject -i perf.data -o perf.inject --itrace=i100il
> > perf report -i perf.inject
> > 
> > Where 'Command' == perf should show as 'top':
> > 
> > # Overhead  Command  Source Shared Object  Source Symbol   
> > Target Symbol   Basic Block Cycles
> > #   ...    ..  
> > ..  ..
> > #
> > 31.08%  perf [unknown] [.] 0x0040c3f8  [.] 
> > 0x0040c3e8  -
> > 
> > If the perf.data file is opened directly with perf, without the
> > inject step, then this already works correctly because the
> > events are synthesised after the COMM and MMAP events and
> > no second sorting happens. Re-sorting only happens when opening
> > the perf.inject file for the second time so timestamps are
> > needed.
> > 
> > Using the timestamp from the AUX record mirrors the current
> > behaviour when opening directly with perf, because the events
> > are generated on the call to cs_etm__process_queues().
> > 
> > Signed-off-by: James Clark 
> > Co-developed-by: Al Grant 
> > Signed-off-by: Al Grant 
> > ---
> >  tools/perf/util/cs-etm.c | 10 --
> >  1 file changed, 8 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> > index c25da2ffa8f3..d0fa9dce47f1 100644
> > --- a/tools/perf/util/cs-etm.c
> > +++ b/tools/perf/util/cs-etm.c
> > @@ -54,6 +54,7 @@ struct cs_etm_auxtrace {
> > u8 sample_instructions;
> >  
> > int num_cpu;
> > +   u64 latest_kernel_timestamp;
> > u32 auxtrace_type;
> > u64 branches_sample_type;
> > u64 branches_id;
> > @@ -1192,6 +1193,8 @@ static int cs_etm__synth_instruction_sample(struct 
> > cs_etm_queue *etmq,
> > 

Re: [PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-15 Thread Leo Yan
Hi James,

On Wed, Apr 14, 2021 at 05:39:19PM +0300, James Clark wrote:
> The following attribute is set when synthesising samples in
> timed decoding mode:
> 
> attr.sample_type |= PERF_SAMPLE_TIME;
> 
> This results in new samples that appear to have timestamps but
> because we don't assign any timestamps to the samples, when the
> resulting inject file is opened again, the synthesised samples
> will be on the wrong side of the MMAP or COMM events.
> 
> For example this results in the samples being associated with
> the perf binary, rather than the target of the record:
> 
> perf record -e cs_etm/@tmc_etr0/u top
> perf inject -i perf.data -o perf.inject --itrace=i100il
> perf report -i perf.inject
> 
> Where 'Command' == perf should show as 'top':
> 
> # Overhead  Command  Source Shared Object  Source Symbol   Target 
> Symbol   Basic Block Cycles
> #   ...    ..  
> ..  ..
> #
> 31.08%  perf [unknown] [.] 0x0040c3f8  [.] 
> 0x0040c3e8  -
> 
> If the perf.data file is opened directly with perf, without the
> inject step, then this already works correctly because the
> events are synthesised after the COMM and MMAP events and
> no second sorting happens. Re-sorting only happens when opening
> the perf.inject file for the second time so timestamps are
> needed.
> 
> Using the timestamp from the AUX record mirrors the current
> behaviour when opening directly with perf, because the events
> are generated on the call to cs_etm__process_queues().
> 
> Signed-off-by: James Clark 
> Co-developed-by: Al Grant 
> Signed-off-by: Al Grant 
> ---
>  tools/perf/util/cs-etm.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index c25da2ffa8f3..d0fa9dce47f1 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -54,6 +54,7 @@ struct cs_etm_auxtrace {
>   u8 sample_instructions;
>  
>   int num_cpu;
> + u64 latest_kernel_timestamp;
>   u32 auxtrace_type;
>   u64 branches_sample_type;
>   u64 branches_id;
> @@ -1192,6 +1193,8 @@ static int cs_etm__synth_instruction_sample(struct 
> cs_etm_queue *etmq,
>   event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
>   event->sample.header.size = sizeof(struct perf_event_header);
>  
> + if (!etm->timeless_decoding)
> + sample.time = etm->latest_kernel_timestamp;
>   sample.ip = addr;
>   sample.pid = tidq->pid;
>   sample.tid = tidq->tid;
> @@ -1248,6 +1251,8 @@ static int cs_etm__synth_branch_sample(struct 
> cs_etm_queue *etmq,
>   event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
>   event->sample.header.size = sizeof(struct perf_event_header);
>  
> + if (!etm->timeless_decoding)
> + sample.time = etm->latest_kernel_timestamp;
>   sample.ip = ip;
>   sample.pid = tidq->pid;
>   sample.tid = tidq->tid;
> @@ -2412,9 +2417,10 @@ static int cs_etm__process_event(struct perf_session 
> *session,
>   else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
>   return cs_etm__process_switch_cpu_wide(etm, event);
>  
> - if (!etm->timeless_decoding &&
> - event->header.type == PERF_RECORD_AUX)
> + if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) {
> + etm->latest_kernel_timestamp = sample_kernel_timestamp;
>       return cs_etm__process_queues(etm);
> + }

The change looks good to me, I went through these two patches for at
least twice, and didn't find issue.

And given the trace data might be overflow and overwritten, it's
reasonable for me to use the PERF_RECORD_AUX timestamp from the tail of
trace data.

Reviewed-by: Leo Yan 

>   return 0;
>  }
> -- 
> 2.28.0
> 


Re: [PATCH] coresight: etm-perf: Fix define build issue when built as module

2021-04-14 Thread Leo Yan
On Wed, Apr 14, 2021 at 08:48:08PM +0100, Mike Leach wrote:
> CONFIG_CORESIGHT_SOURCE_ETM4X is undefined when built as module,
> CONFIG_CORESIGHT_SOURCE_ETM4X_MODULE is defined instead.
> 
> Therefore code in format_attr_contextid_show() not correctly complied
> when coresight built as module.
> 
> Use IS_ENABLED(CONFIG_CORESIGHT_SOURCE_ETM4X) to correct this.
> 
> Fixes: 88f11864cf1d ("coresight: etm-perf: Support PID tracing for kernel at 
> EL2")
> Signed-off-by: Mike Leach 

Reviewed-by: Leo Yan 

And sorry for introducing mistakes when sending the patches.

> ---
>  drivers/hwtracing/coresight/coresight-etm-perf.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
> b/drivers/hwtracing/coresight/coresight-etm-perf.c
> index 560baefdfed8..b2d6db78a025 100644
> --- a/drivers/hwtracing/coresight/coresight-etm-perf.c
> +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
> @@ -78,7 +78,7 @@ static ssize_t format_attr_contextid_show(struct device 
> *dev,
>  {
>   int pid_fmt = ETM_OPT_CTXTID;
>  
> -#if defined(CONFIG_CORESIGHT_SOURCE_ETM4X)
> +#if IS_ENABLED(CONFIG_CORESIGHT_SOURCE_ETM4X)
>   pid_fmt = is_kernel_in_hyp_mode() ? ETM_OPT_CTXTID2 : ETM_OPT_CTXTID;
>  #endif
>   return sprintf(page, "config:%d\n", pid_fmt);
> -- 
> 2.17.1
> 


[PATCH v4 3/6] perf arm-spe: Convert event kernel time to counter value

2021-04-12 Thread Leo Yan
When handle a perf event, Arm SPE decoder needs to decide if this perf
event is earlier or later than the samples from Arm SPE trace data; to
do comparision, it needs to use the same unit for the time.

This patch converts the event kernel time to arch timer's counter value,
thus it can be used to compare with counter value contained in Arm SPE
Timestamp packet.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 7620dcc45940..23714cf0380e 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -669,7 +669,7 @@ static int arm_spe_process_event(struct perf_session 
*session,
}
 
if (sample->time && (sample->time != (u64) -1))
-   timestamp = sample->time;
+   timestamp = perf_time_to_tsc(sample->time, >tc);
else
timestamp = 0;
 
-- 
2.25.1



[PATCH v4 4/6] perf arm-spe: Assign kernel time to synthesized event

2021-04-12 Thread Leo Yan
In current code, it assigns the arch timer counter to the synthesized
samples Arm SPE trace, thus the samples don't contain the kernel time
but only contain the raw counter value.

To fix the issue, this patch converts the timer counter to kernel time
and assigns it to sample timestamp.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 23714cf0380e..c13a89f06ab8 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -234,7 +234,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
struct arm_spe_record *record = >decoder->record;
 
if (!spe->timeless_decoding)
-   sample->time = speq->timestamp;
+   sample->time = tsc_to_perf_time(record->timestamp, >tc);
 
sample->ip = record->from_ip;
sample->cpumode = arm_spe_cpumode(spe, sample->ip);
-- 
2.25.1



[PATCH v4 0/6] perf arm-spe: Enable timestamp

2021-04-12 Thread Leo Yan
This patch set is to enable timestamp for Arm SPE trace.  It reads out
TSC parameters from the TIME_CONV event, the parameters are used for
conversion between timer counter and kernel time and which is applied
for Arm SPE samples.

This version dropped the change for adding hardware clock parameters
into auxtrace info, alternatively, it utilizes the TIME_CONV event to
extract the clock parameters which is used for timestamp calculation.

This patch set can be clearly applied on perf/core branch with:

  commit 2c0cb9f56020 ("perf test: Add a shell test for 'perf stat 
--bpf-counters' new option")

Ths patch series has been tested on Hisilicon D06 platform.

Changes from v3:
* Let to be backwards-compatible for TIME_CONV event (Adrian).

Changes from v2:
* Changed to use TIME_CONV event for extracting clock parameters (Al).

Changes from v1:
* Rebased patch series on the latest perf/core branch;
* Fixed the patch for dumping TSC parameters to support both the
  older and new auxtrace info format.


Leo Yan (6):
  perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS
  perf arm-spe: Save clock parameters from TIME_CONV event
  perf arm-spe: Convert event kernel time to counter value
  perf arm-spe: Assign kernel time to synthesized event
  perf arm-spe: Bail out if the trace is later than perf event
  perf arm-spe: Don't wait for PERF_RECORD_EXIT event

 tools/perf/util/arm-spe.c | 74 +--
 tools/perf/util/arm-spe.h |  1 -
 2 files changed, 64 insertions(+), 11 deletions(-)

-- 
2.25.1



[PATCH v4 2/6] perf arm-spe: Save clock parameters from TIME_CONV event

2021-04-12 Thread Leo Yan
During the recording phase, "perf record" tool synthesizes event
PERF_RECORD_TIME_CONV for the hardware clock parameters and saves the
event into the data file.

Afterwards, when processing the data file, the event TIME_CONV will be
processed at the very early time and is stored into session context.

This patch extracts these parameters from the session context and saves
into the structure "spe->tc" with the type perf_tsc_conversion, so that
the parameters are ready for conversion between clock counter and time
stamp.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 2539d4baec44..7620dcc45940 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -26,6 +26,7 @@
 #include "symbol.h"
 #include "thread.h"
 #include "thread-stack.h"
+#include "tsc.h"
 #include "tool.h"
 #include "util/synthetic-events.h"
 
@@ -45,6 +46,8 @@ struct arm_spe {
struct machine  *machine;
u32 pmu_type;
 
+   struct perf_tsc_conversion  tc;
+
u8  timeless_decoding;
u8  data_queued;
 
@@ -1006,6 +1009,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
 {
struct perf_record_auxtrace_info *auxtrace_info = >auxtrace_info;
size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
+   struct perf_record_time_conv *tc = >time_conv;
struct arm_spe *spe;
int err;
 
@@ -1027,6 +1031,29 @@ int arm_spe_process_auxtrace_info(union perf_event 
*event,
spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
 
spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
+
+   /*
+* The synthesized event PERF_RECORD_TIME_CONV has been handled ahead
+* and the parameters for hardware clock are stored in the session
+* context.  Passes these parameters to the struct perf_tsc_conversion
+* in "spe->tc", which is used for later conversion between clock
+* counter and timestamp.
+*
+* For backward compatibility, checks the event size and save extended
+* fields staring from "time_cycles" if these fields are contained in
+* the event.
+*/
+   spe->tc.time_shift = tc->time_shift;
+   spe->tc.time_mult = tc->time_mult;
+   spe->tc.time_zero = tc->time_zero;
+
+   if (tc->header.size > ((void *)>time_cycles - (void *)tc)) {
+   spe->tc.time_cycles = tc->time_cycles;
+   spe->tc.time_mask = tc->time_mask;
+   spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
+   spe->tc.cap_user_time_short = tc->cap_user_time_short;
+   }
+
spe->auxtrace.process_event = arm_spe_process_event;
spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
spe->auxtrace.flush_events = arm_spe_flush;
-- 
2.25.1



[PATCH v4 5/6] perf arm-spe: Bail out if the trace is later than perf event

2021-04-12 Thread Leo Yan
It's possible that record in Arm SPE trace is later than perf event and
vice versa.  This asks to correlate the perf events and Arm SPE
synthesized events to be processed in the manner of correct timing.

To achieve the time ordering, this patch reverses the flow, it firstly
calls arm_spe_sample() and then calls arm_spe_decode().  By comparing
the timestamp value and detect the perf event is coming earlier than Arm
SPE trace data, it bails out from the decoding loop, the last record is
pushed into auxtrace stack and is deferred to generate sample.  To track
the timestamp, everytime it updates timestamp for the latest record.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 37 ++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index c13a89f06ab8..b37d1cacebe9 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -434,12 +434,36 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
 {
struct arm_spe *spe = speq->spe;
+   struct arm_spe_record *record;
int ret;
 
if (!spe->kernel_start)
spe->kernel_start = machine__kernel_start(spe->machine);
 
while (1) {
+   /*
+* The usual logic is firstly to decode the packets, and then
+* based the record to synthesize sample; but here the flow is
+* reversed: it calls arm_spe_sample() for synthesizing samples
+* prior to arm_spe_decode().
+*
+* Two reasons for this code logic:
+* 1. Firstly, when setup queue in arm_spe__setup_queue(), it
+* has decoded trace data and generated a record, but the record
+* is left to generate sample until run to here, so it's correct
+* to synthesize sample for the left record.
+* 2. After decoding trace data, it needs to compare the record
+* timestamp with the coming perf event, if the record timestamp
+* is later than the perf event, it needs bail out and pushs the
+* record into auxtrace heap, thus the record can be deferred to
+* synthesize sample until run to here at the next time; so this
+* can correlate samples between Arm SPE trace data and other
+* perf events with correct time ordering.
+*/
+   ret = arm_spe_sample(speq);
+   if (ret)
+   return ret;
+
ret = arm_spe_decode(speq->decoder);
if (!ret) {
pr_debug("No data or all data has been processed.\n");
@@ -453,10 +477,17 @@ static int arm_spe_run_decoder(struct arm_spe_queue 
*speq, u64 *timestamp)
if (ret < 0)
continue;
 
-   ret = arm_spe_sample(speq);
-   if (ret)
-   return ret;
+   record = >decoder->record;
 
+   /* Update timestamp for the last record */
+   if (record->timestamp > speq->timestamp)
+   speq->timestamp = record->timestamp;
+
+   /*
+* If the timestamp of the queue is later than timestamp of the
+* coming perf event, bail out so can allow the perf event to
+* be processed ahead.
+*/
if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
*timestamp = speq->timestamp;
return 0;
-- 
2.25.1



[PATCH v4 6/6] perf arm-spe: Don't wait for PERF_RECORD_EXIT event

2021-04-12 Thread Leo Yan
When decode Arm SPE trace, it waits for PERF_RECORD_EXIT event (the last
perf event) for processing trace data, which is needless and even might
cause logic error, e.g. it might fail to correlate perf events with Arm
SPE events correctly.

So this patch removes the condition checking for PERF_RECORD_EXIT event.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index b37d1cacebe9..654fa2413823 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -717,11 +717,7 @@ static int arm_spe_process_event(struct perf_session 
*session,
sample->time);
}
} else if (timestamp) {
-   if (event->header.type == PERF_RECORD_EXIT) {
-   err = arm_spe_process_queues(spe, timestamp);
-   if (err)
-   return err;
-   }
+   err = arm_spe_process_queues(spe, timestamp);
}
 
return err;
-- 
2.25.1



[PATCH v4 1/6] perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS

2021-04-12 Thread Leo Yan
The enum value 'ARM_SPE_PER_CPU_MMAPS' is never used so remove it.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
index 98d3235781c3..105ce0ea0a01 100644
--- a/tools/perf/util/arm-spe.h
+++ b/tools/perf/util/arm-spe.h
@@ -11,7 +11,6 @@
 
 enum {
ARM_SPE_PMU_TYPE,
-   ARM_SPE_PER_CPU_MMAPS,
ARM_SPE_AUXTRACE_PRIV_MAX,
 };
 
-- 
2.25.1



[PATCH v1 2/3] perf session: Add swap operation for event TIME_CONV

2021-04-12 Thread Leo Yan
Since commit d110162cafc8 ("perf tsc: Support cap_user_time_short for
event TIME_CONV"), the event PERF_RECORD_TIME_CONV has extended the data
structure for clock parameters.

To be backwards-compatible, this patch adds a dedicated swap operation
for the event PERF_RECORD_TIME_CONV, based on checking the event size,
it can support both for the old and new event formats.

Fixes: d110162cafc8 ("perf tsc: Support cap_user_time_short for event 
TIME_CONV")
Signed-off-by: Leo Yan 
---
 tools/perf/util/session.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 9a8808507bd9..afca3d5fc851 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -949,6 +949,26 @@ static void perf_event__stat_round_swap(union perf_event 
*event,
event->stat_round.time = bswap_64(event->stat_round.time);
 }
 
+static void perf_event__time_conv_swap(union perf_event *event,
+  bool sample_id_all __maybe_unused)
+{
+   size_t time_zero_size;
+
+   event->time_conv.time_shift = bswap_64(event->time_conv.time_shift);
+   event->time_conv.time_mult  = bswap_64(event->time_conv.time_mult);
+   event->time_conv.time_zero  = bswap_64(event->time_conv.time_zero);
+
+   time_zero_size = (void *)>time_conv.time_cycles - (void *)event;
+   if (event->header.size > time_zero_size) {
+   event->time_conv.time_cycles = 
bswap_64(event->time_conv.time_cycles);
+   event->time_conv.time_mask = 
bswap_64(event->time_conv.time_mask);
+   event->time_conv.cap_user_time_zero =
+   bswap_32(event->time_conv.cap_user_time_zero);
+   event->time_conv.cap_user_time_short =
+   bswap_32(event->time_conv.cap_user_time_short);
+   }
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
bool sample_id_all);
 
@@ -985,7 +1005,7 @@ static perf_event__swap_op perf_event__swap_ops[] = {
[PERF_RECORD_STAT]= perf_event__stat_swap,
[PERF_RECORD_STAT_ROUND]  = perf_event__stat_round_swap,
[PERF_RECORD_EVENT_UPDATE]= perf_event__event_update_swap,
-   [PERF_RECORD_TIME_CONV]   = perf_event__all64_swap,
+   [PERF_RECORD_TIME_CONV]   = perf_event__time_conv_swap,
[PERF_RECORD_HEADER_MAX]  = NULL,
 };
 
-- 
2.25.1



[PATCH v1 3/3] perf session: Dump PERF_RECORD_TIME_CONV event

2021-04-12 Thread Leo Yan
Now perf tool uses the common stub function process_event_op2_stub() for
dumping TIME_CONV event, thus it doesn't output the clock parameters
contained in the event.

This patch adds the callback function for dumping the hardware clock
parameters in TIME_CONV event.

Before:

  # perf report -D

  0x978 [0x38]: event: 79
  .
  . ... raw event: size 56 bytes
  .  :  4f 00 00 00 00 00 38 00 15 00 00 00 00 00 00 00  O.8.
  .  0010:  00 00 40 01 00 00 00 00 86 89 0b bf df ff ff ff  
..@
  .  0020:  d1 c1 b2 39 03 00 00 00 ff ff ff ff ff ff ff 00  
9.
  .  0030:  01 01 00 00 00 00 00 00  

  0 0 0x978 [0x38]: PERF_RECORD_TIME_CONV
  : unhandled!

  [...]

After:

  # perf report -D

  0x978 [0x38]: event: 79
  .
  . ... raw event: size 56 bytes
  .  :  4f 00 00 00 00 00 38 00 15 00 00 00 00 00 00 00  O.8.
  .  0010:  00 00 40 01 00 00 00 00 86 89 0b bf df ff ff ff  
..@
  .  0020:  d1 c1 b2 39 03 00 00 00 ff ff ff ff ff ff ff 00  
9.
  .  0030:  01 01 00 00 00 00 00 00  

  0 0 0x978 [0x38]: PERF_RECORD_TIME_CONV
  ... Time Shift  21
  ... Time Muliplier  20971520
  ... Time Zero   18446743935180835206
  ... Time Cycles 13852918225
  ... Time Mask   0xff
  ... Cap Time Zero   1
  ... Cap Time Short  1
  : unhandled!

  [...]

Signed-off-by: Leo Yan 
---
 tools/perf/util/session.c | 13 -
 tools/perf/util/tsc.c | 31 +++
 tools/perf/util/tsc.h |  4 
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index afca3d5fc851..19a0b2bc5f33 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -29,6 +29,7 @@
 #include "thread-stack.h"
 #include "sample-raw.h"
 #include "stat.h"
+#include "tsc.h"
 #include "ui/progress.h"
 #include "../perf.h"
 #include "arch/common.h"
@@ -451,6 +452,16 @@ static int process_stat_round_stub(struct perf_session 
*perf_session __maybe_unu
return 0;
 }
 
+static int process_event_time_conv_stub(struct perf_session *perf_session 
__maybe_unused,
+   union perf_event *event)
+{
+   if (dump_trace)
+   perf_event__fprintf_time_conv(event, stdout);
+
+   dump_printf(": unhandled!\n");
+   return 0;
+}
+
 static int perf_session__process_compressed_event_stub(struct perf_session 
*session __maybe_unused,
   union perf_event *event 
__maybe_unused,
   u64 file_offset 
__maybe_unused)
@@ -532,7 +543,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
if (tool->stat_round == NULL)
tool->stat_round = process_stat_round_stub;
if (tool->time_conv == NULL)
-   tool->time_conv = process_event_op2_stub;
+   tool->time_conv = process_event_time_conv_stub;
if (tool->feature == NULL)
tool->feature = process_event_op2_stub;
if (tool->compressed == NULL)
diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c
index 62b4c75c966c..e2a2c63e5189 100644
--- a/tools/perf/util/tsc.c
+++ b/tools/perf/util/tsc.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -110,3 +112,32 @@ u64 __weak rdtsc(void)
 {
return 0;
 }
+
+size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp)
+{
+   struct perf_record_time_conv *tc = (struct perf_record_time_conv 
*)event;
+   size_t ret;
+
+   ret  = fprintf(fp, "\n... Time Shift  %" PRI_lu64 "\n", 
tc->time_shift);
+   ret += fprintf(fp, "... Time Muliplier  %" PRI_lu64 "\n", 
tc->time_mult);
+   ret += fprintf(fp, "... Time Zero   %" PRI_lu64 "\n", 
tc->time_zero);
+
+   /*
+* The event TIME_CONV was extended for the fields from "time_cycles"
+* when supported cap_user_time_short, for backward compatibility,
+* checks the event size and prints these extended fields if these
+* fields are contained in the perf data file.
+*/
+   if (tc->header.size > ((void *)>time_cycles - (void *)tc)) {
+   ret += fprintf(fp, "... Time Cycles %" PRI_lu64 "\n",
+  tc->time_cycles);
+   ret += fprintf(fp, "... Time Mask   %#" PRI_lx64 "\n",
+  tc->time_mask);
+   ret += fprintf(fp, "... Cap Time Zero   %" PRId32 "\n",
+  tc->cap_user_time_zero);
+   ret += fprintf(fp, "... Cap Time Short  %" P

[PATCH v1 1/3] perf jit: Let convert_timestamp() to be backwards-compatible

2021-04-12 Thread Leo Yan
Commit d110162cafc8 ("perf tsc: Support cap_user_time_short for event
TIME_CONV") supports the extended parameters for event TIME_CONV, but it
broke the backwards compatibility, so any perf data file with old event
format fails to convert timestamp.

For the backwards-compatibility, this patch checks the event size, if
the event size confirms the extended parameters are supported in the
event TIME_CONV, then copies these parameters.

Fixes: d110162cafc8 ("perf tsc: Support cap_user_time_short for event 
TIME_CONV")
Signed-off-by: Leo Yan 
---
 tools/perf/util/jitdump.c | 31 +--
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index 9760d8e7b386..67b514c38a43 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -396,21 +396,32 @@ static pid_t jr_entry_tid(struct jit_buf_desc *jd, union 
jr_entry *jr)
 
 static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp)
 {
-   struct perf_tsc_conversion tc;
+   struct perf_tsc_conversion tc = { 0 };
+   struct perf_record_time_conv *time_conv = >session->time_conv;
 
if (!jd->use_arch_timestamp)
return timestamp;
 
-   tc.time_shift  = jd->session->time_conv.time_shift;
-   tc.time_mult   = jd->session->time_conv.time_mult;
-   tc.time_zero   = jd->session->time_conv.time_zero;
-   tc.time_cycles = jd->session->time_conv.time_cycles;
-   tc.time_mask   = jd->session->time_conv.time_mask;
-   tc.cap_user_time_zero  = jd->session->time_conv.cap_user_time_zero;
-   tc.cap_user_time_short = jd->session->time_conv.cap_user_time_short;
+   tc.time_shift = time_conv->time_shift;
+   tc.time_mult  = time_conv->time_mult;
+   tc.time_zero  = time_conv->time_zero;
 
-   if (!tc.cap_user_time_zero)
-   return 0;
+   /*
+* The event TIME_CONV was extended for the fields from "time_cycles"
+* when supported cap_user_time_short, for backward compatibility,
+* checks the event size and assigns these extended fields if these
+* fields are contained in the event.
+*/
+   if (time_conv->header.size >
+   ((void *)_conv->time_cycles - (void *)time_conv)) {
+   tc.time_cycles = time_conv->time_cycles;
+   tc.time_mask   = time_conv->time_mask;
+   tc.cap_user_time_zero  = time_conv->cap_user_time_zero;
+   tc.cap_user_time_short = time_conv->cap_user_time_short;
+
+   if (!tc.cap_user_time_zero)
+   return 0;
+   }
 
return tsc_to_perf_time(timestamp, );
 }
-- 
2.25.1



[PATCH v1 0/3] perf: Allow TIME_CONV to be backwards-compatible and dump it

2021-04-12 Thread Leo Yan
The event PERF_RECORD_TIME_CONV was extended for clock parameters, but
the tool fails to be backwards-compatible for the old event format.

Based on checking the event size, this patch series can decide if the
extended clock parameters are contained in the perf event or not.  This
allows the event PERF_RECORD_TIME_CONV to be backwards-compatible.

The last patch also is introduced for dumping the event, for both the
old and latest event formats.

The patch set has been tested on Arm64 HiSilicon D06 platform.

Leo Yan (3):
  perf jit: Let convert_timestamp() to be backwards-compatible
  perf session: Add swap operation for event TIME_CONV
  perf session: Dump PERF_RECORD_TIME_CONV event

 tools/perf/util/jitdump.c | 31 +--
 tools/perf/util/session.c | 35 +--
 tools/perf/util/tsc.c | 31 +++
 tools/perf/util/tsc.h |  4 
 4 files changed, 89 insertions(+), 12 deletions(-)

-- 
2.25.1



Re: [PATCH v3 2/6] perf arm-spe: Save clock parameters from TIME_CONV event

2021-04-11 Thread Leo Yan
On Sat, Apr 10, 2021 at 01:00:42PM +0800, Leo Yan wrote:
> During the recording phase, "perf record" tool synthesizes event
> PERF_RECORD_TIME_CONV for the hardware clock parameters and saves the
> event into the data file.
> 
> Afterwards, when processing the data file, the event TIME_CONV will be
> processed at the very early time and is stored into session context.
> 
> This patch extracts these parameters from the session context and saves
> into the structure "spe->tc" with the type perf_tsc_conversion, so that
> the parameters are ready for conversion between clock counter and time
> stamp.
> 
> Signed-off-by: Leo Yan 

[...]

> ---
>  tools/perf/util/arm-spe.c | 19 +++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> index 2539d4baec44..b48816d5c0b4 100644
> --- a/tools/perf/util/arm-spe.c
> +++ b/tools/perf/util/arm-spe.c
> @@ -26,6 +26,7 @@
>  #include "symbol.h"
>  #include "thread.h"
>  #include "thread-stack.h"
> +#include "tsc.h"
>  #include "tool.h"
>  #include "util/synthetic-events.h"
>  
> @@ -45,6 +46,8 @@ struct arm_spe {
>   struct machine  *machine;
>   u32 pmu_type;
>  
> + struct perf_tsc_conversion  tc;
> +
>   u8  timeless_decoding;
>   u8  data_queued;
>  
> @@ -1027,6 +1030,22 @@ int arm_spe_process_auxtrace_info(union perf_event 
> *event,
>   spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
>  
>   spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
> +
> + /*
> +  * The synthesized event PERF_RECORD_TIME_CONV has been handled
> +  * ahead and the parameters for hardware clock are stored in
> +  * the session context.  Passes these parameters to the structure
> +  * perf_tsc_conversion in "spe->tc", which is used for later
> +  * conversion between clock counter and timestamp.
> +  */
> + spe->tc.time_shift = session->time_conv.time_shift;
> + spe->tc.time_mult = session->time_conv.time_mult;
> + spe->tc.time_zero = session->time_conv.time_zero;
> + spe->tc.time_cycles = session->time_conv.time_cycles;
> + spe->tc.time_mask = session->time_conv.time_mask;
> + spe->tc.cap_user_time_zero = session->time_conv.cap_user_time_zero;
> + spe->tc.cap_user_time_short = session->time_conv.cap_user_time_short;

As Adrain suggested, this patch also should check the size of event
PERF_RECORD_TIME_CONV for backwards-compability.   I will send a new
patch set for this.

Thanks,
Leo

>   spe->auxtrace.process_event = arm_spe_process_event;
>   spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
>   spe->auxtrace.flush_events = arm_spe_flush;
> -- 
> 2.25.1
> 


Re: [PATCH] perf session: Dump PERF_RECORD_TIME_CONV event

2021-04-10 Thread Leo Yan
Hi Adrian,

On Sat, Apr 10, 2021 at 11:46:10AM +0300, Adrian Hunter wrote:

[...]

> Hi Leo
> 
> I think there might be some more work related to this.
> 
> Pedantically, shouldn't you cater for backward compatibility and
> not assume the following were in the perf.data file:
>   
>  
>__u64time_cycles;  
>   
>  
>__u64time_mask;
>   
>  
>bool cap_user_time_zero;   
>   
>  
>bool cap_user_time_short;   
> 
> That means checking the event size.
> 
> Also PERF_RECORD_TIME_CONV should have its own byte-swapper instead of  
> perf_event__all64_swap() - also checking event size.
> 
> i.e. fixes for:
> 
>   commit d110162cafc80dad0622cfd40f3113aebb77e1bb
>   Author: Leo Yan 
>   Date:   Mon Sep 14 19:53:09 2020 +0800
> 
> perf tsc: Support cap_user_time_short for event TIME_CONV

Indeed!  IIUC, should have three fixes with event size checking:

- One fix for dumping TIME_CONV event;
- One fix for byte-swapper (especially for bool values);
- One fix for commit d110162cafc80dad0622cfd40f3113aebb77e1bb;

Will follow up for the suggestions.  Thanks a lot for your insight
review.

Leo


[PATCH v3 6/6] perf arm-spe: Don't wait for PERF_RECORD_EXIT event

2021-04-09 Thread Leo Yan
When decode Arm SPE trace, it waits for PERF_RECORD_EXIT event (the last
perf event) for processing trace data, which is needless and even might
cause logic error, e.g. it might fail to correlate perf events with Arm
SPE events correctly.

So this patch removes the condition checking for PERF_RECORD_EXIT event.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 8facda81a06c..5e98a29fcbdb 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -717,11 +717,7 @@ static int arm_spe_process_event(struct perf_session 
*session,
sample->time);
}
} else if (timestamp) {
-   if (event->header.type == PERF_RECORD_EXIT) {
-   err = arm_spe_process_queues(spe, timestamp);
-   if (err)
-   return err;
-   }
+   err = arm_spe_process_queues(spe, timestamp);
}
 
return err;
-- 
2.25.1



[PATCH v3 5/6] perf arm-spe: Bail out if the trace is later than perf event

2021-04-09 Thread Leo Yan
It's possible that record in Arm SPE trace is later than perf event and
vice versa.  This asks to correlate the perf events and Arm SPE
synthesized events to be processed in the manner of correct timing.

To achieve the time ordering, this patch reverses the flow, it firstly
calls arm_spe_sample() and then calls arm_spe_decode().  By comparing
the timestamp value and detect the perf event is coming earlier than Arm
SPE trace data, it bails out from the decoding loop, the last record is
pushed into auxtrace stack and is deferred to generate sample.  To track
the timestamp, everytime it updates timestamp for the latest record.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 37 ++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index ec7df83b50fd..8facda81a06c 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -434,12 +434,36 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
 {
struct arm_spe *spe = speq->spe;
+   struct arm_spe_record *record;
int ret;
 
if (!spe->kernel_start)
spe->kernel_start = machine__kernel_start(spe->machine);
 
while (1) {
+   /*
+* The usual logic is firstly to decode the packets, and then
+* based the record to synthesize sample; but here the flow is
+* reversed: it calls arm_spe_sample() for synthesizing samples
+* prior to arm_spe_decode().
+*
+* Two reasons for this code logic:
+* 1. Firstly, when setup queue in arm_spe__setup_queue(), it
+* has decoded trace data and generated a record, but the record
+* is left to generate sample until run to here, so it's correct
+* to synthesize sample for the left record.
+* 2. After decoding trace data, it needs to compare the record
+* timestamp with the coming perf event, if the record timestamp
+* is later than the perf event, it needs bail out and pushs the
+* record into auxtrace heap, thus the record can be deferred to
+* synthesize sample until run to here at the next time; so this
+* can correlate samples between Arm SPE trace data and other
+* perf events with correct time ordering.
+*/
+   ret = arm_spe_sample(speq);
+   if (ret)
+   return ret;
+
ret = arm_spe_decode(speq->decoder);
if (!ret) {
pr_debug("No data or all data has been processed.\n");
@@ -453,10 +477,17 @@ static int arm_spe_run_decoder(struct arm_spe_queue 
*speq, u64 *timestamp)
if (ret < 0)
continue;
 
-   ret = arm_spe_sample(speq);
-   if (ret)
-   return ret;
+   record = >decoder->record;
 
+   /* Update timestamp for the last record */
+   if (record->timestamp > speq->timestamp)
+   speq->timestamp = record->timestamp;
+
+   /*
+* If the timestamp of the queue is later than timestamp of the
+* coming perf event, bail out so can allow the perf event to
+* be processed ahead.
+*/
if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
*timestamp = speq->timestamp;
return 0;
-- 
2.25.1



[PATCH v3 4/6] perf arm-spe: Assign kernel time to synthesized event

2021-04-09 Thread Leo Yan
In current code, it assigns the arch timer counter to the synthesized
samples Arm SPE trace, thus the samples don't contain the kernel time
but only contain the raw counter value.

To fix the issue, this patch converts the timer counter to kernel time
and assigns it to sample timestamp.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index f66e10c62473..ec7df83b50fd 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -234,7 +234,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
struct arm_spe_record *record = >decoder->record;
 
if (!spe->timeless_decoding)
-   sample->time = speq->timestamp;
+   sample->time = tsc_to_perf_time(record->timestamp, >tc);
 
sample->ip = record->from_ip;
sample->cpumode = arm_spe_cpumode(spe, sample->ip);
-- 
2.25.1



[PATCH v3 3/6] perf arm-spe: Convert event kernel time to counter value

2021-04-09 Thread Leo Yan
When handle a perf event, Arm SPE decoder needs to decide if this perf
event is earlier or later than the samples from Arm SPE trace data; to
do comparision, it needs to use the same unit for the time.

This patch converts the event kernel time to arch timer's counter value,
thus it can be used to compare with counter value contained in Arm SPE
Timestamp packet.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index b48816d5c0b4..f66e10c62473 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -669,7 +669,7 @@ static int arm_spe_process_event(struct perf_session 
*session,
}
 
if (sample->time && (sample->time != (u64) -1))
-   timestamp = sample->time;
+   timestamp = perf_time_to_tsc(sample->time, >tc);
else
timestamp = 0;
 
-- 
2.25.1



[PATCH v3 2/6] perf arm-spe: Save clock parameters from TIME_CONV event

2021-04-09 Thread Leo Yan
During the recording phase, "perf record" tool synthesizes event
PERF_RECORD_TIME_CONV for the hardware clock parameters and saves the
event into the data file.

Afterwards, when processing the data file, the event TIME_CONV will be
processed at the very early time and is stored into session context.

This patch extracts these parameters from the session context and saves
into the structure "spe->tc" with the type perf_tsc_conversion, so that
the parameters are ready for conversion between clock counter and time
stamp.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 2539d4baec44..b48816d5c0b4 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -26,6 +26,7 @@
 #include "symbol.h"
 #include "thread.h"
 #include "thread-stack.h"
+#include "tsc.h"
 #include "tool.h"
 #include "util/synthetic-events.h"
 
@@ -45,6 +46,8 @@ struct arm_spe {
struct machine  *machine;
u32 pmu_type;
 
+   struct perf_tsc_conversion  tc;
+
u8  timeless_decoding;
u8  data_queued;
 
@@ -1027,6 +1030,22 @@ int arm_spe_process_auxtrace_info(union perf_event 
*event,
spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
 
spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
+
+   /*
+* The synthesized event PERF_RECORD_TIME_CONV has been handled
+* ahead and the parameters for hardware clock are stored in
+* the session context.  Passes these parameters to the structure
+* perf_tsc_conversion in "spe->tc", which is used for later
+* conversion between clock counter and timestamp.
+*/
+   spe->tc.time_shift = session->time_conv.time_shift;
+   spe->tc.time_mult = session->time_conv.time_mult;
+   spe->tc.time_zero = session->time_conv.time_zero;
+   spe->tc.time_cycles = session->time_conv.time_cycles;
+   spe->tc.time_mask = session->time_conv.time_mask;
+   spe->tc.cap_user_time_zero = session->time_conv.cap_user_time_zero;
+   spe->tc.cap_user_time_short = session->time_conv.cap_user_time_short;
+
spe->auxtrace.process_event = arm_spe_process_event;
spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
spe->auxtrace.flush_events = arm_spe_flush;
-- 
2.25.1



[PATCH v3 1/6] perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS

2021-04-09 Thread Leo Yan
The enum value 'ARM_SPE_PER_CPU_MMAPS' is never used so remove it.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
index 98d3235781c3..105ce0ea0a01 100644
--- a/tools/perf/util/arm-spe.h
+++ b/tools/perf/util/arm-spe.h
@@ -11,7 +11,6 @@
 
 enum {
ARM_SPE_PMU_TYPE,
-   ARM_SPE_PER_CPU_MMAPS,
ARM_SPE_AUXTRACE_PRIV_MAX,
 };
 
-- 
2.25.1



[PATCH v3 0/6] perf arm-spe: Enable timestamp

2021-04-09 Thread Leo Yan
This patch set is to enable timestamp for Arm SPE trace.  It reads out
TSC parameters from the TIME_CONV event, the parameters are used for
conversion between timer counter and kernel time and which is applied
for Arm SPE samples.

This version dropped the change for adding hardware clock parameters
into auxtrace info, alternatively, it utilizes the TIME_CONV event to
extract the clock parameters which is used for timestamp calculation.

This patch set can be clearly applied on perf/core branch with:

  commit 2c0cb9f56020 ("perf test: Add a shell test for 'perf stat 
--bpf-counters' new option")

Ths patch series has been tested on Hisilicon D06 platform.

Changes from v2:
* Changed to use TIME_CONV event for extracing clock parameters (Al).

Changes from v1:
* Rebased patch series on the latest perf/core branch;
* Fixed the patch for dumping TSC parameters to support both the
  older and new auxtrace info format.


Leo Yan (6):
  perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS
  perf arm-spe: Save clock parameters from TIME_CONV event
  perf arm-spe: Convert event kernel time to counter value
  perf arm-spe: Assign kernel time to synthesized event
  perf arm-spe: Bail out if the trace is later than perf event
  perf arm-spe: Don't wait for PERF_RECORD_EXIT event

 tools/perf/util/arm-spe.c | 66 +--
 tools/perf/util/arm-spe.h |  1 -
 2 files changed, 56 insertions(+), 11 deletions(-)

-- 
2.25.1



[PATCH] perf session: Dump PERF_RECORD_TIME_CONV event

2021-04-09 Thread Leo Yan
Now perf tool uses the common stub function process_event_op2_stub() for
dumping TIME_CONV event, thus it doesn't output the clock parameters
contained in the event.

This patch adds the callback function for dumping the hardware clock
parameters in TIME_CONV event.

Before:

  # perf report -D

  0x978 [0x38]: event: 79
  .
  . ... raw event: size 56 bytes
  .  :  4f 00 00 00 00 00 38 00 15 00 00 00 00 00 00 00  O.8.
  .  0010:  00 00 40 01 00 00 00 00 86 89 0b bf df ff ff ff  
..@
  .  0020:  d1 c1 b2 39 03 00 00 00 ff ff ff ff ff ff ff 00  
9.
  .  0030:  01 01 00 00 00 00 00 00  

  0 0 0x978 [0x38]: PERF_RECORD_TIME_CONV
  : unhandled!

  [...]

After:

  # perf report -D

  0x978 [0x38]: event: 79
  .
  . ... raw event: size 56 bytes
  .  :  4f 00 00 00 00 00 38 00 15 00 00 00 00 00 00 00  O.8.
  .  0010:  00 00 40 01 00 00 00 00 86 89 0b bf df ff ff ff  
..@
  .  0020:  d1 c1 b2 39 03 00 00 00 ff ff ff ff ff ff ff 00  
9.
  .  0030:  01 01 00 00 00 00 00 00  

  0 0 0x978 [0x38]: PERF_RECORD_TIME_CONV
  ... Time Shift  21
  ... Time Muliplier  20971520
  ... Time Zero   18446743935180835206
  ... Time Cycles 13852918225
  ... Time Mask   0xff
  ... Cap Time Zero   1
  ... Cap Time Short  1
  : unhandled!

  [...]

Signed-off-by: Leo Yan 
---
 tools/perf/util/session.c | 13 -
 tools/perf/util/tsc.c | 18 ++
 tools/perf/util/tsc.h |  4 
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 9a8808507bd9..75931c8054aa 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -29,6 +29,7 @@
 #include "thread-stack.h"
 #include "sample-raw.h"
 #include "stat.h"
+#include "tsc.h"
 #include "ui/progress.h"
 #include "../perf.h"
 #include "arch/common.h"
@@ -451,6 +452,16 @@ static int process_stat_round_stub(struct perf_session 
*perf_session __maybe_unu
return 0;
 }
 
+static int process_event_time_conv_stub(struct perf_session *perf_session 
__maybe_unused,
+   union perf_event *event)
+{
+   if (dump_trace)
+   perf_event__fprintf_time_conv(event, stdout);
+
+   dump_printf(": unhandled!\n");
+   return 0;
+}
+
 static int perf_session__process_compressed_event_stub(struct perf_session 
*session __maybe_unused,
   union perf_event *event 
__maybe_unused,
   u64 file_offset 
__maybe_unused)
@@ -532,7 +543,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
if (tool->stat_round == NULL)
tool->stat_round = process_stat_round_stub;
if (tool->time_conv == NULL)
-   tool->time_conv = process_event_op2_stub;
+   tool->time_conv = process_event_time_conv_stub;
if (tool->feature == NULL)
tool->feature = process_event_op2_stub;
if (tool->compressed == NULL)
diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c
index 62b4c75c966c..4ac3cc72f3e1 100644
--- a/tools/perf/util/tsc.c
+++ b/tools/perf/util/tsc.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -110,3 +112,19 @@ u64 __weak rdtsc(void)
 {
return 0;
 }
+
+size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp)
+{
+   struct perf_record_time_conv *tc = (struct perf_record_time_conv 
*)event;
+   size_t ret;
+
+   ret  = fprintf(fp, "\n... Time Shift  %" PRI_lu64 "\n", 
tc->time_shift);
+   ret += fprintf(fp, "... Time Muliplier  %" PRI_lu64 "\n", 
tc->time_mult);
+   ret += fprintf(fp, "... Time Zero   %" PRI_lu64 "\n", 
tc->time_zero);
+   ret += fprintf(fp, "... Time Cycles %" PRI_lu64 "\n", 
tc->time_cycles);
+   ret += fprintf(fp, "... Time Mask   %#" PRI_lx64 "\n", 
tc->time_mask);
+   ret += fprintf(fp, "... Cap Time Zero   %" PRId32 "\n", 
tc->cap_user_time_zero);
+   ret += fprintf(fp, "... Cap Time Short  %" PRId32 "\n", 
tc->cap_user_time_short);
+
+   return ret;
+}
diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h
index 72a15419f3b3..7d83a31732a7 100644
--- a/tools/perf/util/tsc.h
+++ b/tools/perf/util/tsc.h
@@ -4,6 +4,8 @@
 
 #include 
 
+#include "event.h"
+
 struct perf_tsc_conversion {
u16 time_shift;
u32 time_mult;
@@ -24,4 +26,6 @@ u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc);
 u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc);
 u64 rdtsc(void);
 
+size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp);
+
 #endif // __PERF_TSC_H
-- 
2.25.1



Re: [PATCH v2 0/7] perf arm-spe: Enable timestamp

2021-04-07 Thread Leo Yan
On Wed, Apr 07, 2021 at 04:28:40PM +0300, Adrian Hunter wrote:
> On 7/04/21 4:15 pm, Leo Yan wrote:
> > Hi Al,
> > 
> > On Tue, Apr 06, 2021 at 09:38:32AM +, Al Grant wrote:
> > 
> > [...]
> > 
> >>> This patch set is to enable timestamp for Arm SPE trace.  It reads out TSC
> >>> parameters from mmap page and stores into auxtrace info structure;
> >>
> >> Why not synthesize a PERF_RECORD_TIME_CONV - isn't that specifically to
> >> capture the TSC parameters from the mmap page? If a generic mechanism
> >> exists it would be better to use it, otherwise we'll have to do this again 
> >> for
> >> future trace formats.
> > 
> > Good point!  Actually "perf record" tool has synthesized event
> > PERF_RECORD_TIME_CONV.  This patch series is studying the
> > implementation from Intel-PT, so the question is why the existed
> > implementations (like Intel-PT, Intel-BTS) don't directly use
> > PERF_RECORD_TIME_CONV for retriving TSC parameters.
> 
> PERF_RECORD_TIME_CONV was added later because the TSC information is
> needed by jitdump.

Thanks for the info, Adrian.

If so, it's good for Arm SPE to use PERF_RECORD_TIME_CONV for TSC
parameters.  Will spin patch series for this.

Leo


Re: [PATCH v2 0/7] perf arm-spe: Enable timestamp

2021-04-07 Thread Leo Yan
Hi Al,

On Tue, Apr 06, 2021 at 09:38:32AM +, Al Grant wrote:

[...]

> > This patch set is to enable timestamp for Arm SPE trace.  It reads out TSC
> > parameters from mmap page and stores into auxtrace info structure;
> 
> Why not synthesize a PERF_RECORD_TIME_CONV - isn't that specifically to
> capture the TSC parameters from the mmap page? If a generic mechanism
> exists it would be better to use it, otherwise we'll have to do this again for
> future trace formats.

Good point!  Actually "perf record" tool has synthesized event
PERF_RECORD_TIME_CONV.  This patch series is studying the
implementation from Intel-PT, so the question is why the existed
implementations (like Intel-PT, Intel-BTS) don't directly use
PERF_RECORD_TIME_CONV for retriving TSC parameters.

I agree using PERF_RECORD_TIME_CONV for TSC parameter is better than
extending auxtrace info.  Will experiment for this.

> perf_read_tsc_conversion and perf_event__synth_time_conv are currently
> in arch/x86/util/tsc.c, but nothing in them is x86-specific and they could be
> moved somewhere more generic.

This is not true on the mainline kernel; these functions have been
moved into the file util/tsc.c.

Thanks for suggestions,
Leo


[PATCH v2 7/7] perf arm-spe: Don't wait for PERF_RECORD_EXIT event

2021-04-03 Thread Leo Yan
When decode Arm SPE trace, it waits for PERF_RECORD_EXIT event (the last
perf event) for processing trace data, which is needless and even might
cause logic error, e.g. it might fail to correlate perf events with Arm
SPE events correctly.

So this patch removes the condition checking for PERF_RECORD_EXIT event.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 99a394c366e0..17dcad99912a 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -717,11 +717,7 @@ static int arm_spe_process_event(struct perf_session 
*session,
sample->time);
}
} else if (timestamp) {
-   if (event->header.type == PERF_RECORD_EXIT) {
-   err = arm_spe_process_queues(spe, timestamp);
-   if (err)
-   return err;
-   }
+   err = arm_spe_process_queues(spe, timestamp);
}
 
return err;
-- 
2.25.1



[PATCH v2 6/7] perf arm-spe: Bail out if the trace is later than perf event

2021-04-03 Thread Leo Yan
It's possible that record in Arm SPE trace is later than perf event and
vice versa.  This asks to correlate the perf events and Arm SPE
synthesized events to be processed in the manner of correct timing.

To achieve the time ordering, this patch reverses the flow, it firstly
calls arm_spe_sample() and then calls arm_spe_decode().  By comparing
the timestamp value and detect the perf event is coming earlier than Arm
SPE trace data, it bails out from the decoding loop, the last record is
pushed into auxtrace stack and is deferred to generate sample.  To track
the timestamp, everytime it updates timestamp for the latest record.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 37 ++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 80f5659e7f7e..99a394c366e0 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -434,12 +434,36 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
 {
struct arm_spe *spe = speq->spe;
+   struct arm_spe_record *record;
int ret;
 
if (!spe->kernel_start)
spe->kernel_start = machine__kernel_start(spe->machine);
 
while (1) {
+   /*
+* The usual logic is firstly to decode the packets, and then
+* based the record to synthesize sample; but here the flow is
+* reversed: it calls arm_spe_sample() for synthesizing samples
+* prior to arm_spe_decode().
+*
+* Two reasons for this code logic:
+* 1. Firstly, when setup queue in arm_spe__setup_queue(), it
+* has decoded trace data and generated a record, but the record
+* is left to generate sample until run to here, so it's correct
+* to synthesize sample for the left record.
+* 2. After decoding trace data, it needs to compare the record
+* timestamp with the coming perf event, if the record timestamp
+* is later than the perf event, it needs bail out and pushs the
+* record into auxtrace heap, thus the record can be deferred to
+* synthesize sample until run to here at the next time; so this
+* can correlate samples between Arm SPE trace data and other
+* perf events with correct time ordering.
+*/
+   ret = arm_spe_sample(speq);
+   if (ret)
+   return ret;
+
ret = arm_spe_decode(speq->decoder);
if (!ret) {
pr_debug("No data or all data has been processed.\n");
@@ -453,10 +477,17 @@ static int arm_spe_run_decoder(struct arm_spe_queue 
*speq, u64 *timestamp)
if (ret < 0)
continue;
 
-   ret = arm_spe_sample(speq);
-   if (ret)
-   return ret;
+   record = >decoder->record;
 
+   /* Update timestamp for the last record */
+   if (record->timestamp > speq->timestamp)
+   speq->timestamp = record->timestamp;
+
+   /*
+* If the timestamp of the queue is later than timestamp of the
+* coming perf event, bail out so can allow the perf event to
+* be processed ahead.
+*/
if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
*timestamp = speq->timestamp;
return 0;
-- 
2.25.1



[PATCH v2 5/7] perf arm-spe: Assign kernel time to synthesized event

2021-04-03 Thread Leo Yan
In current code, it assigns the arch timer counter to the synthesized
samples Arm SPE trace, thus the samples don't contain the kernel time
but only contain the raw counter value.

To fix the issue, this patch converts the timer counter to kernel time
and assigns it to sample timestamp.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 4cf558b0218a..80f5659e7f7e 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -234,7 +234,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
struct arm_spe_record *record = >decoder->record;
 
if (!spe->timeless_decoding)
-   sample->time = speq->timestamp;
+   sample->time = tsc_to_perf_time(record->timestamp, >tc);
 
sample->ip = record->from_ip;
sample->cpumode = arm_spe_cpumode(spe, sample->ip);
-- 
2.25.1



[PATCH v2 4/7] perf arm-spe: Convert event kernel time to counter value

2021-04-03 Thread Leo Yan
When handle a perf event, Arm SPE decoder needs to decide if this perf
event is earlier or later than the samples from Arm SPE trace data; to
do comparision, it needs to use the same unit for the time.

This patch converts the event kernel time to arch timer's counter value,
thus it can be used to compare with counter value contained in Arm SPE
Timestamp packet.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 69ce3483d1af..4cf558b0218a 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -669,7 +669,7 @@ static int arm_spe_process_event(struct perf_session 
*session,
}
 
if (sample->time && (sample->time != (u64) -1))
-   timestamp = sample->time;
+   timestamp = perf_time_to_tsc(sample->time, >tc);
else
timestamp = 0;
 
-- 
2.25.1



[PATCH v2 3/7] perf arm-spe: Dump TSC parameters

2021-04-03 Thread Leo Yan
The TSC parameters are stored in auxtrace info, this patch dumps these
parameters for reporting the raw data.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.c | 42 ++-
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 2539d4baec44..69ce3483d1af 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -26,6 +26,7 @@
 #include "symbol.h"
 #include "thread.h"
 #include "thread-stack.h"
+#include "tsc.h"
 #include "tool.h"
 #include "util/synthetic-events.h"
 
@@ -45,6 +46,8 @@ struct arm_spe {
struct machine  *machine;
u32 pmu_type;
 
+   struct perf_tsc_conversion  tc;
+
u8  timeless_decoding;
u8  data_queued;
 
@@ -803,14 +806,23 @@ static bool arm_spe_evsel_is_auxtrace(struct perf_session 
*session,
 
 static const char * const arm_spe_info_fmts[] = {
[ARM_SPE_PMU_TYPE]  = "  PMU Type   %"PRId64"\n",
+   [ARM_SPE_TIME_SHIFT]= "  Time Shift %"PRIu64"\n",
+   [ARM_SPE_TIME_MULT] = "  Time Muliplier %"PRIu64"\n",
+   [ARM_SPE_TIME_ZERO] = "  Time Zero  %"PRIu64"\n",
+   [ARM_SPE_TIME_CYCLES]   = "  Time Cycles%"PRIu64"\n",
+   [ARM_SPE_TIME_MASK] = "  Time Mask  %#"PRIx64"\n",
+   [ARM_SPE_CAP_USER_TIME_SHORT]   = "  Cap Time Short %"PRId64"\n",
 };
 
-static void arm_spe_print_info(__u64 *arr)
+static void arm_spe_print_info(__u64 *arr, int start, int finish)
 {
+   int i;
+
if (!dump_trace)
return;
 
-   fprintf(stdout, arm_spe_info_fmts[ARM_SPE_PMU_TYPE], 
arr[ARM_SPE_PMU_TYPE]);
+   for (i = start; i <= finish; i++)
+   fprintf(stdout, arm_spe_info_fmts[i], arr[i]);
 }
 
 struct arm_spe_synth {
@@ -1001,11 +1013,19 @@ arm_spe_synth_events(struct arm_spe *spe, struct 
perf_session *session)
return 0;
 }
 
+static bool arm_spe_has(struct perf_record_auxtrace_info *auxtrace_info,
+   int pos)
+{
+   return auxtrace_info->header.size >=
+   (sizeof(struct perf_record_auxtrace_info) +
+(sizeof(u64) * (pos + 1)));
+}
+
 int arm_spe_process_auxtrace_info(union perf_event *event,
  struct perf_session *session)
 {
struct perf_record_auxtrace_info *auxtrace_info = >auxtrace_info;
-   size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
+   size_t min_sz = sizeof(u64) * ARM_SPE_TIME_SHIFT;
struct arm_spe *spe;
int err;
 
@@ -1025,6 +1045,20 @@ int arm_spe_process_auxtrace_info(union perf_event 
*event,
spe->machine = >machines.host; /* No kvm support */
spe->auxtrace_type = auxtrace_info->type;
spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
+   arm_spe_print_info(_info->priv[0], ARM_SPE_PMU_TYPE,
+  ARM_SPE_PMU_TYPE);
+
+   if (arm_spe_has(auxtrace_info, ARM_SPE_CAP_USER_TIME_SHORT)) {
+   spe->tc.time_shift = auxtrace_info->priv[ARM_SPE_TIME_SHIFT];
+   spe->tc.time_mult = auxtrace_info->priv[ARM_SPE_TIME_MULT];
+   spe->tc.time_zero = auxtrace_info->priv[ARM_SPE_TIME_ZERO];
+   spe->tc.time_cycles = auxtrace_info->priv[ARM_SPE_TIME_CYCLES];
+   spe->tc.time_mask = auxtrace_info->priv[ARM_SPE_TIME_MASK];
+   spe->tc.cap_user_time_short =
+   auxtrace_info->priv[ARM_SPE_CAP_USER_TIME_SHORT];
+   arm_spe_print_info(_info->priv[0], ARM_SPE_TIME_SHIFT,
+  ARM_SPE_CAP_USER_TIME_SHORT);
+   }
 
spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
spe->auxtrace.process_event = arm_spe_process_event;
@@ -1035,8 +1069,6 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
session->auxtrace = >auxtrace;
 
-   arm_spe_print_info(_info->priv[0]);
-
if (dump_trace)
return 0;
 
-- 
2.25.1



[PATCH v2 2/7] perf arm-spe: Store TSC parameters in auxtrace info

2021-04-03 Thread Leo Yan
The TSC parameters are used for conversion between arch timer counter
and kernel timestamp, this patch stores the parameters into the struct
perf_record_auxtrace_info, and it is saved in perf data file.

Signed-off-by: Leo Yan 
---
 tools/perf/arch/arm64/util/arm-spe.c | 23 +++
 tools/perf/util/arm-spe.h|  6 ++
 2 files changed, 29 insertions(+)

diff --git a/tools/perf/arch/arm64/util/arm-spe.c 
b/tools/perf/arch/arm64/util/arm-spe.c
index 414c8a5584b1..dd940cf16f49 100644
--- a/tools/perf/arch/arm64/util/arm-spe.c
+++ b/tools/perf/arch/arm64/util/arm-spe.c
@@ -15,7 +15,9 @@
 #include "../../../util/event.h"
 #include "../../../util/evsel.h"
 #include "../../../util/evlist.h"
+#include "../../../util/mmap.h"
 #include "../../../util/session.h"
+#include "../../../util/tsc.h"
 #include  // page_size
 #include "../../../util/pmu.h"
 #include "../../../util/debug.h"
@@ -47,6 +49,9 @@ static int arm_spe_info_fill(struct auxtrace_record *itr,
struct arm_spe_recording *sper =
container_of(itr, struct arm_spe_recording, itr);
struct perf_pmu *arm_spe_pmu = sper->arm_spe_pmu;
+   struct perf_event_mmap_page *pc;
+   struct perf_tsc_conversion tc = { .time_mult = 0, };
+   int err;
 
if (priv_size != ARM_SPE_AUXTRACE_PRIV_SIZE)
return -EINVAL;
@@ -54,8 +59,26 @@ static int arm_spe_info_fill(struct auxtrace_record *itr,
if (!session->evlist->core.nr_mmaps)
return -EINVAL;
 
+   pc = session->evlist->mmap[0].core.base;
+   if (pc) {
+   err = perf_read_tsc_conversion(pc, );
+   if (err) {
+   if (err != -EOPNOTSUPP)
+   return err;
+   }
+
+   if (!tc.time_mult)
+   ui__warning("Arm SPE: arch timer not available\n");
+   }
+
auxtrace_info->type = PERF_AUXTRACE_ARM_SPE;
auxtrace_info->priv[ARM_SPE_PMU_TYPE] = arm_spe_pmu->type;
+   auxtrace_info->priv[ARM_SPE_TIME_SHIFT] = tc.time_shift;
+   auxtrace_info->priv[ARM_SPE_TIME_MULT] = tc.time_mult;
+   auxtrace_info->priv[ARM_SPE_TIME_ZERO] = tc.time_zero;
+   auxtrace_info->priv[ARM_SPE_TIME_CYCLES] = tc.time_cycles;
+   auxtrace_info->priv[ARM_SPE_TIME_MASK] = tc.time_mask;
+   auxtrace_info->priv[ARM_SPE_CAP_USER_TIME_SHORT] = 
tc.cap_user_time_short;
 
return 0;
 }
diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
index 105ce0ea0a01..5bf3e838d226 100644
--- a/tools/perf/util/arm-spe.h
+++ b/tools/perf/util/arm-spe.h
@@ -11,6 +11,12 @@
 
 enum {
ARM_SPE_PMU_TYPE,
+   ARM_SPE_TIME_SHIFT,
+   ARM_SPE_TIME_MULT,
+   ARM_SPE_TIME_ZERO,
+   ARM_SPE_TIME_CYCLES,
+   ARM_SPE_TIME_MASK,
+   ARM_SPE_CAP_USER_TIME_SHORT,
ARM_SPE_AUXTRACE_PRIV_MAX,
 };
 
-- 
2.25.1



[PATCH v2 1/7] perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS

2021-04-03 Thread Leo Yan
The enum value 'ARM_SPE_PER_CPU_MMAPS' is never used so remove it.

Signed-off-by: Leo Yan 
---
 tools/perf/util/arm-spe.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
index 98d3235781c3..105ce0ea0a01 100644
--- a/tools/perf/util/arm-spe.h
+++ b/tools/perf/util/arm-spe.h
@@ -11,7 +11,6 @@
 
 enum {
ARM_SPE_PMU_TYPE,
-   ARM_SPE_PER_CPU_MMAPS,
ARM_SPE_AUXTRACE_PRIV_MAX,
 };
 
-- 
2.25.1



[PATCH v2 0/7] perf arm-spe: Enable timestamp

2021-04-03 Thread Leo Yan
As we know, the timestamp is important for AUX trace; it's mainly used
to correlate between perf events and AUX trace, allows to generate
events with time ordered manner.  There have several good examples of
enabling timestamp for AUX trace (like Intel-pt, Intel-bts, etc).

Since the conversion between TSC and kernel timestamp has been supported
on Arm64, TSC is a naming convention from x86, but perf now has reused
it to support Arm arch timer counter.

This patch set is to enable timestamp for Arm SPE trace.  It reads out
TSC parameters from mmap page and stores into auxtrace info structure;
the TSC parameters are used for conversion between timer counter and
kernel time and which is applied for Arm SPE samples.

This patch set can be clearly applied on perf/core branch with:

  commit 6859bc0e78c6 ("perf stat: Improve readability of shadow stats")

Ths patch series has been tested on Hisilicon D06 platform.

After:

  # perf script -F comm,time,cpu,pid,dso,ip,sym

  perf  2408 [032]   168.680297:  bd1253690a3c perf_event_exec 
([kernel.kallsyms])
  perf  2408 [032]   168.680297:  bd1253690a3c perf_event_exec 
([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680317:  bd1253683f50 
perf_iterate_ctx.constprop.0 ([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680317:  bd1253683f50 
perf_iterate_ctx.constprop.0 ([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680319:  bd1253683f70 
perf_iterate_ctx.constprop.0 ([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680319:  bd1253683f70 
perf_iterate_ctx.constprop.0 ([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680367:  bd12539b03ec 
__arch_clear_user ([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680375:  bd1253721440 kmem_cache_alloc 
([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680375:  bd1253721440 kmem_cache_alloc 
([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680375:  bd1253721440 kmem_cache_alloc 
([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680375:  bd1253721440 kmem_cache_alloc 
([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680376:  bd1253683f70 
perf_iterate_ctx.constprop.0 ([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680376:  bd1253683f70 
perf_iterate_ctx.constprop.0 ([kernel.kallsyms])
   false_sharing.e  2408 [032]   168.680376:  bd1253683f70 
perf_iterate_ctx.constprop.0 ([kernel.kallsyms])

Changes from v1:
* Rebased patch series on the latest perf/core branch;
* Fixed the patch for dumping TSC parameters to support both the
  older and new auxtrace info format.


Leo Yan (7):
  perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS
  perf arm-spe: Store TSC parameters in auxtrace info
  perf arm-spe: Dump TSC parameters
  perf arm-spe: Convert event kernel time to counter value
  perf arm-spe: Assign kernel time to synthesized event
  perf arm-spe: Bail out if the trace is later than perf event
  perf arm-spe: Don't wait for PERF_RECORD_EXIT event

 tools/perf/arch/arm64/util/arm-spe.c | 23 +++
 tools/perf/util/arm-spe.c| 89 +++-
 tools/perf/util/arm-spe.h|  7 ++-
 3 files changed, 103 insertions(+), 16 deletions(-)

-- 
2.25.1



Re: [PATCH] mailbox: hisilicon: Use the correct HiSilicon copyright

2021-03-30 Thread Leo Yan
On Tue, Mar 30, 2021 at 02:48:40PM +0800, Hao Fang wrote:
> s/Hisilicon/HiSilicon/g.
> It should use capital S, according to
> https://www.hisilicon.com/en/terms-of-use.
> 
> Signed-off-by: Hao Fang 

Though the kernel has tons of "Hisilicon", for this patch:

Reviewed-by: Leo Yan 

> ---
>  drivers/mailbox/hi3660-mailbox.c | 4 ++--
>  drivers/mailbox/hi6220-mailbox.c | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/mailbox/hi3660-mailbox.c 
> b/drivers/mailbox/hi3660-mailbox.c
> index 53f4bc2..45c6d69 100644
> --- a/drivers/mailbox/hi3660-mailbox.c
> +++ b/drivers/mailbox/hi3660-mailbox.c
> @@ -1,5 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
> -// Copyright (c) 2017-2018 Hisilicon Limited.
> +// Copyright (c) 2017-2018 HiSilicon Limited.
>  // Copyright (c) 2017-2018 Linaro Limited.
>  
>  #include 
> @@ -297,5 +297,5 @@ static void __exit hi3660_mbox_exit(void)
>  module_exit(hi3660_mbox_exit);
>  
>  MODULE_LICENSE("GPL");
> -MODULE_DESCRIPTION("Hisilicon Hi3660 Mailbox Controller");
> +MODULE_DESCRIPTION("HiSilicon Hi3660 Mailbox Controller");
>  MODULE_AUTHOR("Leo Yan ");
> diff --git a/drivers/mailbox/hi6220-mailbox.c 
> b/drivers/mailbox/hi6220-mailbox.c
> index cc236ac..9fdc25e 100644
> --- a/drivers/mailbox/hi6220-mailbox.c
> +++ b/drivers/mailbox/hi6220-mailbox.c
> @@ -1,8 +1,8 @@
>  // SPDX-License-Identifier: GPL-2.0-only
>  /*
> - * Hisilicon's Hi6220 mailbox driver
> + * HiSilicon's Hi6220 mailbox driver
>   *
> - * Copyright (c) 2015 Hisilicon Limited.
> + * Copyright (c) 2015 HiSilicon Limited.
>   * Copyright (c) 2015 Linaro Limited.
>   *
>   * Author: Leo Yan 
> -- 
> 2.8.1
> 


Re: [PATCH -next] coresight: etm-perf: Mark format_attr_contextid with static keyword

2021-03-23 Thread Leo Yan
On Tue, Mar 23, 2021 at 07:54:52AM +, Zou Wei wrote:
> Fix the following sparse warning:
> 
> drivers/hwtracing/coresight/coresight-etm-perf.c:61:25: warning: symbol
> 'format_attr_contextid' was not declared. Should it be static?
> 
> Signed-off-by: Zou Wei 

Reviewed-by: Leo Yan 

> ---
>  drivers/hwtracing/coresight/coresight-etm-perf.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
> b/drivers/hwtracing/coresight/coresight-etm-perf.c
> index 0f603b4094f2..bdbb77334329 100644
> --- a/drivers/hwtracing/coresight/coresight-etm-perf.c
> +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
> @@ -58,7 +58,7 @@ static ssize_t format_attr_contextid_show(struct device 
> *dev,
>   return sprintf(page, "config:%d\n", pid_fmt);
>  }
>  
> -struct device_attribute format_attr_contextid =
> +static struct device_attribute format_attr_contextid =
>   __ATTR(contextid, 0444, format_attr_contextid_show, NULL);
>  
>  static struct attribute *etm_config_formats_attr[] = {
> -- 
> 2.17.1
> 


Re: [PATCH v2] arm64: dts: msm8916: Enable CoreSight STM component

2021-03-21 Thread Leo Yan
On Sun, Mar 21, 2021 at 08:11:05PM +0800, Leo Yan wrote:
> Add DT binding for CoreSight System Trace Macrocell (STM) on msm8916,
> which can benefit the CoreSight development on DB410c.
> 
> Signed-off-by: Georgi Djakov 
> Signed-off-by: Leo Yan 

When I rebased this patch, I didn't know why the patch auther's name was
changed unexpectly.  So have sent patch v3 to correct it.

Sorry for spamming.
Leo


[PATCH v3] arm64: dts: msm8916: Enable CoreSight STM component

2021-03-21 Thread Leo Yan
From: Georgi Djakov 

Add DT binding for CoreSight System Trace Macrocell (STM) on msm8916,
which can benefit the CoreSight development on DB410c.

Signed-off-by: Georgi Djakov 
Signed-off-by: Leo Yan 
---

Changes from v2:
* Correct for author name.

Changes from v1:
* alphabetically and address ordering for DT node; pad addresses with
  zeroes (Stephan Gerhold).

 arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi |  1 +
 arch/arm64/boot/dts/qcom/msm8916.dtsi | 27 +++
 2 files changed, 28 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi 
b/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi
index 3a9538e1ec97..2165b7415add 100644
--- a/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi
@@ -410,6 +410,7 @@ _codec {
  { status = "okay"; };
  { status = "okay"; };
  { status = "okay"; };
+ { status = "okay"; };
  { status = "okay"; };
 
 _rpm_regulators {
diff --git a/arch/arm64/boot/dts/qcom/msm8916.dtsi 
b/arch/arm64/boot/dts/qcom/msm8916.dtsi
index 402e891a84ab..f02b976480d5 100644
--- a/arch/arm64/boot/dts/qcom/msm8916.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8916.dtsi
@@ -489,6 +489,26 @@ snoc: interconnect@58 {
 < RPM_SMD_SNOC_A_CLK>;
};
 
+   stm: stm@802000 {
+   compatible = "arm,coresight-stm", "arm,primecell";
+   reg = <0x00802000 0x1000>,
+ <0x0928 0x18>;
+   reg-names = "stm-base", "stm-stimulus-base";
+
+   clocks = < RPM_QDSS_CLK>, < RPM_QDSS_A_CLK>;
+   clock-names = "apb_pclk", "atclk";
+
+   status = "disabled";
+
+   out-ports {
+   port {
+   stm_out: endpoint {
+   remote-endpoint = 
<_in7>;
+   };
+   };
+   };
+   };
+
/* System CTIs */
/* CTI 0 - TMC connections */
cti0: cti@81 {
@@ -562,6 +582,13 @@ funnel0_in4: endpoint {
remote-endpoint = 
<_out>;
};
};
+
+   port@7 {
+   reg = <7>;
+   funnel0_in7: endpoint {
+   remote-endpoint = <_out>;
+   };
+   };
};
 
out-ports {
-- 
2.25.1



[PATCH v2] arm64: dts: msm8916: Enable CoreSight STM component

2021-03-21 Thread Leo Yan
Add DT binding for CoreSight System Trace Macrocell (STM) on msm8916,
which can benefit the CoreSight development on DB410c.

Signed-off-by: Georgi Djakov 
Signed-off-by: Leo Yan 
---

Changes from v1:
* alphabetically and address ordering for DT node; pad addresses with
* zeroes (Stephan Gerhold).

 arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi |  1 +
 arch/arm64/boot/dts/qcom/msm8916.dtsi | 27 +++
 2 files changed, 28 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi 
b/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi
index 3a9538e1ec97..2165b7415add 100644
--- a/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi
@@ -410,6 +410,7 @@ _codec {
  { status = "okay"; };
  { status = "okay"; };
  { status = "okay"; };
+ { status = "okay"; };
  { status = "okay"; };
 
 _rpm_regulators {
diff --git a/arch/arm64/boot/dts/qcom/msm8916.dtsi 
b/arch/arm64/boot/dts/qcom/msm8916.dtsi
index 402e891a84ab..f02b976480d5 100644
--- a/arch/arm64/boot/dts/qcom/msm8916.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8916.dtsi
@@ -489,6 +489,26 @@ snoc: interconnect@58 {
 < RPM_SMD_SNOC_A_CLK>;
};
 
+   stm: stm@802000 {
+   compatible = "arm,coresight-stm", "arm,primecell";
+   reg = <0x00802000 0x1000>,
+ <0x0928 0x18>;
+   reg-names = "stm-base", "stm-stimulus-base";
+
+   clocks = < RPM_QDSS_CLK>, < RPM_QDSS_A_CLK>;
+   clock-names = "apb_pclk", "atclk";
+
+   status = "disabled";
+
+   out-ports {
+   port {
+   stm_out: endpoint {
+   remote-endpoint = 
<_in7>;
+   };
+   };
+   };
+   };
+
/* System CTIs */
/* CTI 0 - TMC connections */
cti0: cti@81 {
@@ -562,6 +582,13 @@ funnel0_in4: endpoint {
remote-endpoint = 
<_out>;
};
};
+
+   port@7 {
+   reg = <7>;
+   funnel0_in7: endpoint {
+   remote-endpoint = <_out>;
+   };
+   };
};
 
out-ports {
-- 
2.25.1



[PATCH] perf test: Change to use bash for daemon test

2021-03-20 Thread Leo Yan
When executed the daemon test on Arm64 and x86 with Debian (Buster)
distro, both skip the test case with the log:

  # ./perf test -v 76
  76: daemon operations   :
  --- start ---
  test child forked, pid 11687
  test daemon list
  trap: SIGINT: bad trap
  ./tests/shell/daemon.sh: 173: local: cpu-clock: bad variable name
  test child finished with -2
   end 
  daemon operations: Skip

So the error happens for the variable expansion when use local variable
in the shell script.  Since Debian Buster uses dash but not bash as
non-interactive shell, when execute the daemon testing, it hits a
known issue for dash which was reported [1].

To resolve this issue, one option is to add double quotes for all local
variables assignment, so need to change the code from:

  local line=`perf daemon --config ${config} -x: | head -2 | tail -1`

  ... to:

  local line="`perf daemon --config ${config} -x: | head -2 | tail -1`"

But the testing script has bunch of local variables, this leads to big
changes for whole script.

On the other hand, the testing script asks to use the "local" feature
which is bash-specific, so this patch explicitly uses "#!/bin/bash" to
ensure running the script with bash.

After:

  # ./perf test -v 76
  76: daemon operations   :
  --- start ---
  test child forked, pid 11329
  test daemon list
  test daemon reconfig
  test daemon stop
  test daemon signal
  signal 12 sent to session 'test [11596]'
  signal 12 sent to session 'test [11596]'
  test daemon ping
  test daemon lock
  test child finished with 0
   end 
  daemon operations: Ok

[1] https://bugs.launchpad.net/ubuntu/+source/dash/+bug/139097

Fixes: 2291bb915b55 ("perf tests: Add daemon 'list' command test")
Signed-off-by: Leo Yan 
---
 tools/perf/tests/shell/daemon.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh
index ee4a30ca3f57..45fc24af5b07 100755
--- a/tools/perf/tests/shell/daemon.sh
+++ b/tools/perf/tests/shell/daemon.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # daemon operations
 # SPDX-License-Identifier: GPL-2.0
 
-- 
2.25.1



Re: [PATCH v3 4/8] perf cs-etm: Fix bitmap for option

2021-03-05 Thread Leo Yan
Hi Arnaldo,

On Fri, Mar 05, 2021 at 02:29:44PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, Feb 09, 2021 at 09:58:55AM +0800, Leo Yan escreveu:
> > On Mon, Feb 08, 2021 at 01:46:41PM -0700, Mathieu Poirier wrote:
> > > On Sat, Feb 06, 2021 at 11:08:29PM +0800, Leo Yan wrote:
> > > > From: Suzuki K Poulose 
> > > > 
> > > > When set option with macros ETM_OPT_CTXTID and ETM_OPT_TS, it wrongly
> > > > takes these two values (14 and 28 prespectively) as bit masks, but
> > > > actually both are the offset for bits.  But this doesn't lead to
> > > > further failure due to the AND logic operation will be always true for
> > > > ETM_OPT_CTXTID / ETM_OPT_TS.
> > > > 
> > > > This patch defines new independent macros (rather than using the
> > > > "config" bits) for requesting the "contextid" and "timestamp" for
> > > > cs_etm_set_option().
> > > > 
> > > > [leoy: Extract the change as a separate patch for easier review]
> > > 
> > > This should go just above your name - see below.
> 
> I fixed this up and added this patch to my perf/urgent branch, for
> v5.12, since the kernel bits are upstream and this is a fix.

Yeah, it makes sense to pick this patch into perf/urgent branch since
it's a fixing patch.

Actually, this patch has been merged into the tmp.perf/core branch [1],
after you move it to the perf/urgent branch, I can confirm all other
patches for perf tool in this series have been merged into the
tmp.perf/core branch.

Thanks,
Leo

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/commit/?h=tmp.perf/core=8c559e8d68630d64d932bada633705f6551427df


Re: [PATCH RESEND WITH CCs v3 3/4] perf tools: enable dwarf_callchain_users on aarch64

2021-03-05 Thread Leo Yan
On Fri, Mar 05, 2021 at 07:51:20PM +0800, Leo Yan wrote:

[...]

> > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > index 2a845d6cac09..93661a3eaeb1 100644
> > --- a/tools/perf/builtin-report.c
> > +++ b/tools/perf/builtin-report.c
> > @@ -405,6 +405,10 @@ static int report__setup_sample_type(struct report 
> > *rep)
> >  
> > callchain_param_setup(sample_type);
> >  
> > +   if (callchain_param.record_mode == CALLCHAIN_FP &&
> > +   strncmp(rep->session->header.env.arch, "aarch64", 7) == 
> > 0)
> > +   dwarf_callchain_users = true;
> > +
> 
> I don't have knowledge for dwarf or FP.
> 
> This patch is suspicious for me that since it only fixes the issue for
> "perf report" command, but it cannot support "perf script".
> 
> I did a quick testing for "perf script" command with the test code from
> patch 04, seems to me it cannot fix the fp omitting issue for
> "perf script" command:
> 
>   arm64_fp_test 11211  2282.355095: 176307 cycles: 
>   c2e40740 f2+0x10 (/root/arm64_fp_test)
>   c2e4061c main+0xc (/root/arm64_fp_test)
>   961fbd24 __libc_start_main+0xe4 
> (/usr/lib/aarch64-linux-gnu/libc-2.28.so)
>   c2e4065c _start+0x34 (/root/arm64_fp_test)
> 
> Could you check for this?  Thanks!

Maybe we can consolidate the setting for the global variable
"dwarf_callchain_users" with below change; this can help us to cover
the tools for most cases.  I used the below change to replact patch
03, "perf report" and "perf script" both can work well with it.

Please note, if you want to move forward with this way, it's better to
use a saperate patch for firstly refactoring the function
script__setup_sample_type() by using the general API
callchain_param_setup() to replace the duplicate code pieces for
callchain parameter setting up.

After that, you could apply the reset change for adding new parameter
"arch" for the function script__setup_sample_type().


diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 2a845d6cac09..ca2e8c9096ea 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1090,7 +1090,8 @@ static int process_attr(struct perf_tool *tool 
__maybe_unused,
 * on events sample_type.
 */
sample_type = evlist__combined_sample_type(*pevlist);
-   callchain_param_setup(sample_type);
+   callchain_param_setup(sample_type,
+ perf_env__arch((*pevlist)->env));
return 0;
 }
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 5915f19cee55..c49212c135b2 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2250,7 +2250,8 @@ static int process_attr(struct perf_tool *tool, union 
perf_event *event,
 * on events sample_type.
 */
sample_type = evlist__combined_sample_type(evlist);
-   callchain_param_setup(sample_type);
+   callchain_param_setup(sample_type,
+ perf_env__arch((*pevlist)->env));
 
/* Enable fields for callchain entries */
if (symbol_conf.use_callchain &&
@@ -3309,16 +3310,8 @@ static void script__setup_sample_type(struct perf_script 
*script)
struct perf_session *session = script->session;
u64 sample_type = evlist__combined_sample_type(session->evlist);
 
-   if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain) {
-   if ((sample_type & PERF_SAMPLE_REGS_USER) &&
-   (sample_type & PERF_SAMPLE_STACK_USER)) {
-   callchain_param.record_mode = CALLCHAIN_DWARF;
-   dwarf_callchain_users = true;
-   } else if (sample_type & PERF_SAMPLE_BRANCH_STACK)
-   callchain_param.record_mode = CALLCHAIN_LBR;
-   else
-   callchain_param.record_mode = CALLCHAIN_FP;
-   }
+   callchain_param_setup(sample_type,
+ perf_env__arch(session->machines.host.env));
 
if (script->stitch_lbr && (callchain_param.record_mode != 
CALLCHAIN_LBR)) {
pr_warning("Can't find LBR callchain. Switch off 
--stitch-lbr.\n"
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 1b60985690bb..d9766b54cd1a 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1600,7 +1600,7 @@ void callchain_cursor_reset(struct callchain_cursor 
*cursor)
map__zput(node->ms.map);
 }
 
-void callchain_param_setup(u64 sample_type)
+void callchain_param_setup(u64 sample_type, const char *arch)
 {
if (symbol_con

Re: [PATCH RESEND WITH CCs v3 3/4] perf tools: enable dwarf_callchain_users on aarch64

2021-03-05 Thread Leo Yan
Hi Alexandre,

On Thu, Mar 04, 2021 at 04:32:54PM +, Alexandre Truong wrote:
> On arm64, enable dwarf_callchain_users which will be needed
> to do a dwarf unwind in order to get the caller of the leaf frame.
> 
> Signed-off-by: Alexandre Truong 
> Cc: John Garry 
> Cc: Will Deacon 
> Cc: Mathieu Poirier 
> Cc: Leo Yan 
> Cc: Peter Zijlstra 
> Cc: Ingo Molnar 
> Cc: Arnaldo Carvalho de Melo 
> Cc: Mark Rutland 
> Cc: Alexander Shishkin 
> Cc: Jiri Olsa 
> Cc: Namhyung Kim 
> Cc: Kemeng Shi 
> Cc: Ian Rogers 
> Cc: Andi Kleen 
> Cc: Kan Liang 
> Cc: Jin Yao 
> Cc: Adrian Hunter 
> Cc: Suzuki K Poulose 
> Cc: Al Grant 
> Cc: James Clark 
> Cc: Wilco Dijkstra 
> ---
>  tools/perf/builtin-report.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 2a845d6cac09..93661a3eaeb1 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -405,6 +405,10 @@ static int report__setup_sample_type(struct report *rep)
>  
>   callchain_param_setup(sample_type);
>  
> + if (callchain_param.record_mode == CALLCHAIN_FP &&
> + strncmp(rep->session->header.env.arch, "aarch64", 7) == 
> 0)
> + dwarf_callchain_users = true;
> +

I don't have knowledge for dwarf or FP.

This patch is suspicious for me that since it only fixes the issue for
"perf report" command, but it cannot support "perf script".

I did a quick testing for "perf script" command with the test code from
patch 04, seems to me it cannot fix the fp omitting issue for
"perf script" command:

  arm64_fp_test 11211  2282.355095: 176307 cycles: 
  c2e40740 f2+0x10 (/root/arm64_fp_test)
  c2e4061c main+0xc (/root/arm64_fp_test)
  961fbd24 __libc_start_main+0xe4 
(/usr/lib/aarch64-linux-gnu/libc-2.28.so)
  c2e4065c _start+0x34 (/root/arm64_fp_test)

Could you check for this?  Thanks!

Leo

>   if (rep->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
>   ui__warning("Can't find LBR callchain. Switch off 
> --stitch-lbr.\n"
>   "Please apply --call-graph lbr when recording.\n");
> -- 
> 2.23.0
> 


Re: [PATCH 0/6] coresight: Patches for v5.12 (perf tools)

2021-03-03 Thread Leo Yan
On Tue, Mar 02, 2021 at 02:11:10PM -0300, Arnaldo Carvalho de Melo wrote:
>
>
> On March 2, 2021 2:02:17 PM GMT-03:00, Mike Leach  
> wrote:
> >On Tue, 2 Mar 2021 at 16:42, Mathieu Poirier
> > wrote:
> >>
> >> On Tue, Mar 02, 2021 at 01:24:27PM -0300, Arnaldo Carvalho de Melo
> >wrote:
>
>
> >> > Can you please try one more time, these are the last csets on this
> >> > branch:
> >> >
> >> >   $ git log --oneline acme/tmp.perf/core -10
> >> >   8e1488a46dcf73b1 (HEAD -> perf/core, five/perf/core,
> >acme/tmp.perf/core, acme.korg/tmp.perf/core) perf cs-etm: Detect pid in
> >VMID for kernel running at EL2
> >> >   47f0d94c203751dd perf cs-etm: Add helper cs_etm__get_pid_fmt()
> >> >   30cb76aabfb4deab perf cs-etm: Support PID tracing in config
> >> >   8c559e8d68630d64 perf cs-etm: Fix bitmap for option
> >> >   2bb4ccbd95d7fbf5 tools headers UAPI: Update tools' copy of
> >linux/coresight-pmu.h
> >> >   42b2b570b34afb5f perf cs-etm: Update ETM metadata format
> >> >   83bf6fb8b076c72f perf vendor events power9: Remove unsupported
> >metrics
> >> >   34968b9327c83589 perf buildid-cache: Add test for PE executable
> >> >   9bb8b74bdb186bd3 perf docs: Add man pages to see also
> >> >   d9fd5a718977702f perf tools: Generate mips syscalls_n64.c syscall
> >table
> >> >   $
> >>
> >> As far as I can tell you have all 6 patches.
> >>
> >
> >Agreed - [1] I was trying is in fact:
> >42b2b570b34afb5f perf cs-etm: Update ETM metadata format
> >in the above list.
>
> Ok, I misunderstood, good that's all already in, thanks for checking!

Thanks for the effort, Arnaldo/Mathieu/Mike.


Re: [PATCH 2/7] perf cs-etm: Only search timestamp in current sample's queue.

2021-03-02 Thread Leo Yan
On Mon, Mar 01, 2021 at 05:28:57PM +0200, James Clark wrote:
> 
> 
> On 20/02/2021 13:50, Leo Yan wrote:
> > On Fri, Feb 12, 2021 at 04:45:08PM +0200, James Clark wrote:
> >> Change initial timestamp search to only operate on the queue
> >> related to the current event. In a later change the bounds
> >> of the aux record will also be used to reset the decoder and
> >> the record is only relevant to a single queue.
> > 
> > I roughly understand this patch tries to establish the mechanism for
> > timstamp search per CPU, but I am struggling to understand what's issue
> > you try to address.
> > 
> > So could you give more description for "what's current issue?" and
> > "why need to use this way to fix the issue?".  This would be very
> > appreciated.
> 
> Hi Leo,
> 
> The issue is that the aux records used to reset the decoder are associated
> with a specific CPU/aux queue. Currently when any new data is received, all
> queues are searched for a timestamp. We can't do it that way any more because
> the aux records aren't available yet.
> 
> The reason to fix it this way is because now we can only do decode when
> an aux record is received. This will happen multiple times, and will also
> be cpu/queue specific.

Okay, I think you are try to establish the logic as below:

- Step 1: Wait for PERF_RECORD_AUX event;
- Step 2: Receive PERF_RECORD_AUX event, knows the aux buffer
  offset and size;
- Step 3: Update the auxtrace queue based on the info from
  PERF_RECORD_AUX event;
- Step 4: Find the first trace for timestamp, drop any trace data
  prior to the timestamp;
- Step 5: Conintue to decode the trace data which is contained in the
  current PERF_RECORD_AUX event;
- Step 6: Finish the decoding and goto step1.

> > 
> >> This change makes some files that had coresight data
> >> but didn't syntesise any events start working and generating
> >> events. I'm not sure of the reason for that. I'd expect this
> >> change to only affect the ordering of events.
> > 
> > This seems to me that this patch introduces regression.
> 
> I'm wondering if it is a regression, or accidentally fixing a bug.
> It doesn't seem like it's possible to go from not generating any samples to
> generating lots without accidentally fixing an existing issue. If there is
> valid data there, what would be stopping it from generating any samples?

I think we need to clarify what's the current code logic:

- Step 1: at beginning, only prepare for auxtrace queues, for N:1
  model (all ETMs use the same sink), there have only one queue; for
  1:1 model (one ETM has its own sink), there have multiple queues.

  The function cs_etm__update_queues() is for this step, it doesn't
  generate any sample and only prepare for the first timestamp so
  later the sampels can be compared with each other for the trace data
  coming from multiple CPUs.

- Step 2: it starts to decode the trace data and synthesize samples,
  this is finished by function cs_etm__process_queues().

Seems to me, now we need to fix the issue in step2, in step2, it
decodes the whole AUX buffer, so the main target is to limit the
decoding length, which should base on the info provided by
PERF_RECORD_AUX event.

IIUC, now you are trying to fix issue in step 1, but step 1 is not the
root cause for the issue.

> I do need to look into this more closely though to find the real reason for
> it, which will probably shed more light on it.
> 
> > 
> >> Signed-off-by: James Clark 
> >> ---
> >>  tools/perf/util/cs-etm.c | 30 ++
> >>  1 file changed, 14 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> >> index 27894facae5e..8f8b448632fb 100644
> >> --- a/tools/perf/util/cs-etm.c
> >> +++ b/tools/perf/util/cs-etm.c
> >> @@ -97,7 +97,7 @@ struct cs_etm_queue {
> >>  /* RB tree for quick conversion between traceID and metadata pointers */
> >>  static struct intlist *traceid_list;
> >>  
> >> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm);
> >> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
> >>  static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
> >>  static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
> >>   pid_t tid);
> >> @@ -524,7 +524,6 @@ static void cs_etm__dump_event(struct cs_etm_auxtrace 
> >> *etm,
> >>  static int cs_etm__flush_events(struct perf_session *session,
> >>struct perf_tool *tool)
> >>  {
> >> -  

Re: [PATCH 3/7] perf cs-etm: Save aux records in each etm queue

2021-03-02 Thread Leo Yan
On Mon, Mar 01, 2021 at 05:43:43PM +0200, James Clark wrote:

[...]

> > I'd like to propose to add a new field "cs_etm_queue::buf_rec_len", it
> > stands for the record length based on the RECORD_AUX event.  In
> > theory, this value should be always less than "cs_etm_queue::buf_len".
> > 
> > When every time the "PERF_RECORD_AUX" event is coming, we find out the
> > corresponding queue (so this can be applied for "1:1" or "N:1" models
> > for source and sink), and accumulate "perf_record_aux::aux_size" into
> > "cs_etm_queue::buf_rec_len".
> > 
> > At the decoder side, it decreases "etmq->buf_rec_len" until to zero for
> > the current round of decoding (see cs_etm__decode_data_block()).  Since
> > all the "PERF_RECORD_AUX" event will be processed before
> > "PERF_RECORD_EXIT" event, so we don't worry the tail trace data will be
> > ignored.
> > 
> > The main reason for this suggestion is it don't need to change the
> > significant logic in current code.  I will try to do experiment for this
> > idea and share back.
> > 
> > James, if you think I miss anything, please correct me as needed.
> > Thanks!
> > 
> 
> This is an interesting idea, I think we could push decoded packets into the
> min heap as the aux records are received, and not do anything with them until
> the end of the data is reached. That way instead of saving aux records, we'd
> save the result of the decode for each aux record.
> 
> Currently each cs_etm_queue has a cs_etm_traceid_queue/cs_etm_packet_queue 
> for each
> stream, but that would have to be changed to have multiple ones because 
> multiple
> packets could be decoded to get through the whole aux record.
> 
> It would be a similarly sized change, and could also have a bigger impact on
> memory. So I'm not sure if it would help to reduce the changes, but it is 
> possible.

Below change is still very coarse and I just did very basic testing for
it, so didn't cover all cases; so simply use it to demonstrate the basic
idea.

Before the event PERF_RECORD_AUX arrives, we don't decode any trace
data.  And after PERF_RECORD_AUX coming, the aux buffer size will be
accumulated into the queue, and decode the trace data for the queue
based on the accumulated buffer length.

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index b9c1d329a7f1..3bd5609b6de4 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -89,7 +89,7 @@ struct cs_etm_queue {
u8 pending_timestamp;
u64 offset;
const unsigned char *buf;
-   size_t buf_len, buf_used;
+   size_t aux_buf_len, buf_len, buf_used;
/* Conversion between traceID and index in traceid_queues array */
struct intlist *traceid_queues_list;
struct cs_etm_traceid_queue **traceid_queues;
@@ -1085,6 +1085,7 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
if (old_buffer)
auxtrace_buffer__drop_data(old_buffer);
etmq->buf_len = 0;
+   etmq->aux_buf_len = 0;
return 0;
}
 
@@ -2052,6 +2053,7 @@ static int cs_etm__decode_data_block(struct cs_etm_queue 
*etmq)
etmq->offset += processed;
etmq->buf_used += processed;
etmq->buf_len -= processed;
+   etmq->aux_buf_len -= processed;
 
 out:
return ret;
@@ -2177,7 +2179,7 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq)
 */
err = cs_etm__process_traceid_queue(etmq, tidq);
 
-   } while (etmq->buf_len);
+   } while (etmq->aux_buf_len > 0);
 
if (err == 0)
/* Flush any remaining branch stack entries */
@@ -2216,6 +2218,27 @@ static int cs_etm__process_timeless_queues(struct 
cs_etm_auxtrace *etm,
return 0;
 }
 
+static void cs_etm__update_aux_buf_len(struct cs_etm_auxtrace *etm,
+ struct perf_record_aux *aux)
+{
+   unsigned int cs_queue_nr, queue_nr;
+   struct auxtrace_queue *queue;
+   struct cs_etm_queue *etmq;
+
+   if (!etm->heap.heap_cnt)
+   return;
+
+   /* Take the entry at the top of the min heap */
+   cs_queue_nr = etm->heap.heap_array[0].queue_nr;
+   queue_nr = TO_QUEUE_NR(cs_queue_nr);
+   queue = >queues.queue_array[queue_nr];
+   etmq = queue->priv;
+
+   etmq->aux_buf_len += aux->aux_size;
+   fprintf(stderr, "%s: aux_buf_len=%ld\n", __func__, etmq->aux_buf_len);
+   return;
+}
+
 static int cs_etm__process_queues(struct cs_etm_auxtrace *etm)
 {
int ret = 0;
@@ -2272,6 +2295,9 @@ static int cs_etm__process_queues(struct cs_etm_auxtrace 
*etm)
if (ret < 0)
goto out;
 
+   if (etmq->aux_buf_len <= 0)
+   goto out;
+
/*
 * No more auxtrace_buffers to process in this etmq, simply
 * move on to another entry in the auxtrace_heap.
@@ -2414,9 

Re: [PATCH 3/7] perf cs-etm: Save aux records in each etm queue

2021-02-26 Thread Leo Yan
On Fri, Feb 12, 2021 at 04:45:09PM +0200, James Clark wrote:
> The aux records will be used set the bounds of decoding in a
> later commit. In the future we may also want to use the flags
> of each record to control decoding.
> 
> Do these need to be saved in their entirety, or can pointers
> to each record safely be saved instead for later access?

Rather than introudcing the perf record list, I just wander if we can
use easier method to fix this problem.  So below is the rough idea
(though I don't really verify it):

The essential information we need is what's the valid buffer length can
be used for decoding.  Though cs_etm_queue::buf_len tracks the buffer
length, but it's the buffer length is for the whole AUX buffer, and
which belongs to multiple "PERF_RECORD_AUX" events.  So we cannot decode
at once for the whole trace data in the AUX trace buffer, on the other
hand, the incoming "PERF_RECORD_AUX" event can guide the CoreSight
decoder it should decode how much buffer size.  At the end, the trace
data can be decoded step by step based on the incoming "PERF_RECORD_AUX"
events.

I'd like to propose to add a new field "cs_etm_queue::buf_rec_len", it
stands for the record length based on the RECORD_AUX event.  In
theory, this value should be always less than "cs_etm_queue::buf_len".

When every time the "PERF_RECORD_AUX" event is coming, we find out the
corresponding queue (so this can be applied for "1:1" or "N:1" models
for source and sink), and accumulate "perf_record_aux::aux_size" into
"cs_etm_queue::buf_rec_len".

At the decoder side, it decreases "etmq->buf_rec_len" until to zero for
the current round of decoding (see cs_etm__decode_data_block()).  Since
all the "PERF_RECORD_AUX" event will be processed before
"PERF_RECORD_EXIT" event, so we don't worry the tail trace data will be
ignored.

The main reason for this suggestion is it don't need to change the
significant logic in current code.  I will try to do experiment for this
idea and share back.

James, if you think I miss anything, please correct me as needed.
Thanks!

Leo

> Signed-off-by: James Clark 
> ---
>  tools/perf/util/cs-etm.c | 32 +---
>  1 file changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 8f8b448632fb..88b541b2a804 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -92,12 +92,16 @@ struct cs_etm_queue {
>   /* Conversion between traceID and index in traceid_queues array */
>   struct intlist *traceid_queues_list;
>   struct cs_etm_traceid_queue **traceid_queues;
> + int aux_record_list_len;
> + int aux_record_list_idx;
> + struct perf_record_aux *aux_record_list;
>  };
>  
>  /* RB tree for quick conversion between traceID and metadata pointers */
>  static struct intlist *traceid_list;
>  
> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu,
> +  struct perf_record_aux *aux_record);
>  static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
>  static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
>  pid_t tid);
> @@ -585,6 +589,7 @@ static void cs_etm__free_queue(void *priv)
>  
>   cs_etm_decoder__free(etmq->decoder);
>   cs_etm__free_traceid_queues(etmq);
> + free(etmq->aux_record_list);
>   free(etmq);
>  }
>  
> @@ -759,6 +764,19 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
> cs_etm_auxtrace *etm)
>   return NULL;
>  }
>  
> +static int cs_etm__save_aux_record(struct cs_etm_queue *etmq,
> +struct perf_record_aux *aux_record)
> +{
> + etmq->aux_record_list = reallocarray(etmq->aux_record_list,
> +   etmq->aux_record_list_len+1,
> +   sizeof(*etmq->aux_record_list));
> + if (!etmq->aux_record_list)
> + return -ENOMEM;
> +
> + etmq->aux_record_list[etmq->aux_record_list_len++] = *aux_record;
> + return 0;
> +}
> +
>  static int cs_etm__search_first_timestamp(struct cs_etm_queue *etmq)
>  {
>   int ret = 0;
> @@ -865,7 +883,7 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace 
> *etm)
>   return 0;
>  }
>  
> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu, 
> struct perf_record_aux *aux)
>  {
>   int ret;
>   if (etm->queues.new_data) {
> @@ -875,6 +893,14 @@ static int cs_etm__update_queues(struct cs_etm_auxtrace 
> *etm, int cpu)
>   return ret;
>   }
>  
> + /* In timeless mode, cpu is set to -1, and a single aux buffer is 
> filled */
> + if (cpu < 0)
> + cpu = 0;
> +
> + ret = cs_etm__save_aux_record(etm->queues.queue_array[cpu].priv, aux);
> + if 

Re: [PATCH 2/7] perf cs-etm: Only search timestamp in current sample's queue.

2021-02-20 Thread Leo Yan
On Fri, Feb 12, 2021 at 04:45:08PM +0200, James Clark wrote:
> Change initial timestamp search to only operate on the queue
> related to the current event. In a later change the bounds
> of the aux record will also be used to reset the decoder and
> the record is only relevant to a single queue.

I roughly understand this patch tries to establish the mechanism for
timstamp search per CPU, but I am struggling to understand what's issue
you try to address.

So could you give more description for "what's current issue?" and
"why need to use this way to fix the issue?".  This would be very
appreciated.

> This change makes some files that had coresight data
> but didn't syntesise any events start working and generating
> events. I'm not sure of the reason for that. I'd expect this
> change to only affect the ordering of events.

This seems to me that this patch introduces regression.

> Signed-off-by: James Clark 
> ---
>  tools/perf/util/cs-etm.c | 30 ++
>  1 file changed, 14 insertions(+), 16 deletions(-)
> 
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 27894facae5e..8f8b448632fb 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -97,7 +97,7 @@ struct cs_etm_queue {
>  /* RB tree for quick conversion between traceID and metadata pointers */
>  static struct intlist *traceid_list;
>  
> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm);
> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
>  static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
>  static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
>  pid_t tid);
> @@ -524,7 +524,6 @@ static void cs_etm__dump_event(struct cs_etm_auxtrace 
> *etm,
>  static int cs_etm__flush_events(struct perf_session *session,
>   struct perf_tool *tool)
>  {
> - int ret;
>   struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
>  struct cs_etm_auxtrace,
>  auxtrace);
> @@ -534,11 +533,6 @@ static int cs_etm__flush_events(struct perf_session 
> *session,
>   if (!tool->ordered_events)
>   return -EINVAL;
>  
> - ret = cs_etm__update_queues(etm);
> -
> - if (ret < 0)
> - return ret;
> -

When flush events, it means the trace data is discontinuous or at the
end of trace data.  If the trace data is discontinuous, we need to use
cs_etm__update_queues() to create new queues.  So if we remove the
calling cs_etm__update_queues(), I suspect it cannot handle the
discontinuous trace data anymore.

>   if (etm->timeless_decoding)
>   return cs_etm__process_timeless_queues(etm, -1);
>  
> @@ -851,10 +845,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace 
> *etm,
>   etmq->queue_nr = queue_nr;
>   etmq->offset = 0;
>  
> - if (etm->timeless_decoding)
> - return 0;
> - else
> - return cs_etm__search_first_timestamp(etmq);
> + return 0;
>  }
>  
>  static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
> @@ -874,14 +865,20 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace 
> *etm)
>   return 0;
>  }
>  
> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm)
> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
>  {
> + int ret;
>   if (etm->queues.new_data) {
>   etm->queues.new_data = false;
> - return cs_etm__setup_queues(etm);
> + ret = cs_etm__setup_queues(etm);

Just remind, the new parameter "cpu" is introduced in this function,
in theory, cs_etm__update_queues() should work for the specified CPU.
But it always setup queues for all CPUs with calling
cs_etm__setup_queues().

> + if (ret)
> + return ret;
>   }
>  
> - return 0;
> + if (!etm->timeless_decoding)
> + return 
> cs_etm__search_first_timestamp(etm->queues.queue_array[cpu].priv);
> + else
> + return 0;

In the original code of cs_etm__update_queues(), if there have no any
new data (or has already setup queues), then it does nothing and
directly bails out.

After applied the up change, it will always search the first timestamp
for the "cpu".

>  }
>  
>  static inline
> @@ -2358,8 +2355,9 @@ static int cs_etm__process_event(struct perf_session 
> *session,
>   else
>   timestamp = 0;
>  
> - if (timestamp || etm->timeless_decoding) {
> - err = cs_etm__update_queues(etm);
> + if ((timestamp || etm->timeless_decoding)
> + && event->header.type == PERF_RECORD_AUX) {
> + err = cs_etm__update_queues(etm, sample->cpu);

Here might cause potential issue.  Let's see an example:

If CoreSight uses N:1 model for tracers and sink (just like multiple
ETMs output trace to the same 

Re: [PATCH 1/7] perf cs-etm: Split up etm queue setup function

2021-02-20 Thread Leo Yan
Hi James,

On Fri, Feb 12, 2021 at 04:45:07PM +0200, James Clark wrote:
> Refactor the function into separate allocation and
> timestamp search parts. Later the timestamp search
> will be done multiple times.

The new introduced function cs_etm__search_first_timestamp() is to
search timestamp; if it cannot find any valid timestamp, it will
drop all packets for every queue with the logic:

  cs_etm__search_first_timestamp()
  {
 timestamp = cs_etm__etmq_get_timestamp();

 if (timestamp)
   return auxtrace_heap__add();  -> heapify timestamp

 cs_etm__clear_all_packet_queues();  -> clear all packets
 return 0;
  }

If the function cs_etm__search_first_timestamp() is invoked for multiple
times, is it possible to clear all packets in the middle of decoding?

>From my understanding, it makes sense to drop the trace data at the
very early decoding so that can get the timestamp for packets,
afterwards the packets should always have valid timestamp?  If so, I
don't think it's necessary to contain the function
cs_etm__clear_all_packet_queues() in cs_etm__search_first_timestamp().

I will go back to check this conclusion is correct or not after I
have better understanding for the whole patch set.  Just note here.

Thanks,
Leo

> Signed-off-by: James Clark 
> ---
>  tools/perf/util/cs-etm.c | 60 +---
>  1 file changed, 31 insertions(+), 29 deletions(-)
> 
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index a2a369e2fbb6..27894facae5e 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -765,33 +765,12 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
> cs_etm_auxtrace *etm)
>   return NULL;
>  }
>  
> -static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
> -struct auxtrace_queue *queue,
> -unsigned int queue_nr)
> +static int cs_etm__search_first_timestamp(struct cs_etm_queue *etmq)
>  {
>   int ret = 0;
> + u64 timestamp;
>   unsigned int cs_queue_nr;
>   u8 trace_chan_id;
> - u64 timestamp;
> - struct cs_etm_queue *etmq = queue->priv;
> -
> - if (list_empty(>head) || etmq)
> - goto out;
> -
> - etmq = cs_etm__alloc_queue(etm);
> -
> - if (!etmq) {
> - ret = -ENOMEM;
> - goto out;
> - }
> -
> - queue->priv = etmq;
> - etmq->etm = etm;
> - etmq->queue_nr = queue_nr;
> - etmq->offset = 0;
> -
> - if (etm->timeless_decoding)
> - goto out;
>  
>   /*
>* We are under a CPU-wide trace scenario.  As such we need to know
> @@ -808,7 +787,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace 
> *etm,
>*/
>   ret = cs_etm__get_data_block(etmq);
>   if (ret <= 0)
> - goto out;
> + return ret;
>  
>   /*
>* Run decoder on the trace block.  The decoder will stop when
> @@ -817,7 +796,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace 
> *etm,
>*/
>   ret = cs_etm__decode_data_block(etmq);
>   if (ret)
> - goto out;
> + return ret;
>  
>   /*
>* Function cs_etm_decoder__do_{hard|soft}_timestamp() does all
> @@ -849,10 +828,33 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace 
> *etm,
>* Note that packets decoded above are still in the traceID's packet
>* queue and will be processed in cs_etm__process_queues().
>*/
> - cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id);
> - ret = auxtrace_heap__add(>heap, cs_queue_nr, timestamp);
> -out:
> - return ret;
> + cs_queue_nr = TO_CS_QUEUE_NR(etmq->queue_nr, trace_chan_id);
> + return auxtrace_heap__add(>etm->heap, cs_queue_nr, timestamp);
> +}
> +
> +static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
> +struct auxtrace_queue *queue,
> +unsigned int queue_nr)
> +{
> + struct cs_etm_queue *etmq = queue->priv;
> +
> + if (list_empty(>head) || etmq)
> + return 0;
> +
> + etmq = cs_etm__alloc_queue(etm);
> +
> + if (!etmq)
> + return -ENOMEM;
> +
> + queue->priv = etmq;
> + etmq->etm = etm;
> + etmq->queue_nr = queue_nr;
> + etmq->offset = 0;
> +
> + if (etm->timeless_decoding)
> + return 0;
> + else
> + return cs_etm__search_first_timestamp(etmq);
>  }
>  
>  static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
> -- 
> 2.28.0
> 


[PATCH v1 2/2] perf test: Output the sub testing result in cs-etm

2021-02-15 Thread Leo Yan
The CoreSight testing contains sub cases, e.g. every CPU iterates the
possible conntected sinks and tests the paths between the associated ETM
with the found sink.  Besides the per-thread testing, it also contains
system wide testing and snapshot testing.

To easier observe results for the sub cases, this patch introduces a new
function arm_cs_report(), it outputs the result as "PASS" or "FAIL" for
every sub case; and it records the error in the variable "glb_err" which
is used as the final return value when exits the testing.

Before:

  # perf test 73 -v
  73: Check Arm CoreSight trace data recording and synthesized samples:
  --- start ---
  test child forked, pid 17423
  Recording trace (only user mode) with path: CPU0 => tmc_etf0
  Looking at perf.data file for dumping branch samples:
  Looking at perf.data file for reporting branch samples:
  Looking at perf.data file for instruction samples:
  Recording trace (only user mode) with path: CPU0 => tmc_etr0
  Looking at perf.data file for dumping branch samples:
  Looking at perf.data file for reporting branch samples:
  Looking at perf.data file for instruction samples:

  [...]

After:

  # perf test 73 -v
  73: Check Arm CoreSight trace data recording and synthesized samples:
  --- start ---
  test child forked, pid 17423
  Recording trace (only user mode) with path: CPU0 => tmc_etf0
  Looking at perf.data file for dumping branch samples:
  Looking at perf.data file for reporting branch samples:
  Looking at perf.data file for instruction samples:
  CoreSight path testing (CPU0 -> tmc_etf0): PASS
  Recording trace (only user mode) with path: CPU0 => tmc_etr0
  Looking at perf.data file for dumping branch samples:
  Looking at perf.data file for reporting branch samples:
  Looking at perf.data file for instruction samples:
  CoreSight path testing (CPU0 -> tmc_etr0): PASS
  [...]

Signed-off-by: Leo Yan 
---
 tools/perf/tests/shell/test_arm_coresight.sh | 24 
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tools/perf/tests/shell/test_arm_coresight.sh 
b/tools/perf/tests/shell/test_arm_coresight.sh
index 59b647455ec6..c9eef0bba6f1 100755
--- a/tools/perf/tests/shell/test_arm_coresight.sh
+++ b/tools/perf/tests/shell/test_arm_coresight.sh
@@ -11,6 +11,7 @@
 
 perfdata=$(mktemp /tmp/__perf_test.perf.data.X)
 file=$(mktemp /tmp/temporary_file.X)
+glb_err=0
 
 skip_if_no_cs_etm_event() {
perf list | grep -q 'cs_etm//' && return 0
@@ -69,6 +70,15 @@ perf_report_instruction_samples() {
egrep " +[0-9]+\.[0-9]+% +$1" > /dev/null 2>&1
 }
 
+arm_cs_report() {
+   if [ $2 != 0 ]; then
+   echo "$1: FAIL"
+   glb_err=$2
+   else
+   echo "$1: PASS"
+   fi
+}
+
 is_device_sink() {
# If the node of "enable_sink" is existed under the device path, this
# means the device is a sink device.  Need to exclude 'tpiu' since it
@@ -113,9 +123,7 @@ arm_cs_iterate_devices() {
perf_report_instruction_samples touch
 
err=$?
-
-   # Exit when find failure
-   [ $err != 0 ] && exit $err
+   arm_cs_report "CoreSight path testing (CPU$2 -> 
$device_name)" $err
fi
 
arm_cs_iterate_devices $dev $2
@@ -143,9 +151,7 @@ arm_cs_etm_system_wide_test() {
perf_report_instruction_samples perf
 
err=$?
-
-   # Exit when find failure
-   [ $err != 0 ] && exit $err
+   arm_cs_report "CoreSight system wide testing" $err
 }
 
 arm_cs_etm_snapshot_test() {
@@ -169,12 +175,10 @@ arm_cs_etm_snapshot_test() {
perf_report_instruction_samples dd
 
err=$?
-
-   # Exit when find failure
-   [ $err != 0 ] && exit $err
+   arm_cs_report "CoreSight snapshot testing" $err
 }
 
 arm_cs_etm_traverse_path_test
 arm_cs_etm_system_wide_test
 arm_cs_etm_snapshot_test
-exit 0
+exit $glb_err
-- 
2.25.1



[PATCH v1 1/2] perf test: Suppress logs in cs-etm testing

2021-02-15 Thread Leo Yan
With the option '-v' for the verbose logs, "perf test" outputs tons of
logs for the CoreSight case, the logs are mainly introduced by the
decoding.  And it outputs some trivial info from "perf record" command
and there have debugging info for CPU number and device name when
iterates between ETMs and sinks.

For a neat output format, this patch redirects the output logs to
"/dev/null", thus can avoid to flood logs.  And it removes the redundant
log for CPU number and device name, which have already printed out the
relevant info in the function record_touch_file().

Signed-off-by: Leo Yan 
---
 tools/perf/tests/shell/test_arm_coresight.sh | 21 +---
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/tools/perf/tests/shell/test_arm_coresight.sh 
b/tools/perf/tests/shell/test_arm_coresight.sh
index 18fde2f179cd..59b647455ec6 100755
--- a/tools/perf/tests/shell/test_arm_coresight.sh
+++ b/tools/perf/tests/shell/test_arm_coresight.sh
@@ -33,7 +33,7 @@ record_touch_file() {
echo "Recording trace (only user mode) with path: CPU$2 => $1"
rm -f $file
perf record -o ${perfdata} -e cs_etm/@$1/u --per-thread \
-   -- taskset -c $2 touch $file
+   -- taskset -c $2 touch $file > /dev/null 2>&1
 }
 
 perf_script_branch_samples() {
@@ -43,8 +43,8 @@ perf_script_branch_samples() {
#   touch  6512  1 branches:u:  b220824c 
strcmp+0xc (/lib/aarch64-linux-gnu/ld-2.27.so)
#   touch  6512  1 branches:u:  b22082e0 
strcmp+0xa0 (/lib/aarch64-linux-gnu/ld-2.27.so)
#   touch  6512  1 branches:u:  b2208320 
strcmp+0xe0 (/lib/aarch64-linux-gnu/ld-2.27.so)
-   perf script -F,-time -i ${perfdata} | \
-   egrep " +$1 +[0-9]+ .* +branches:(.*:)? +"
+   perf script -F,-time -i ${perfdata} 2>&1 | \
+   egrep " +$1 +[0-9]+ .* +branches:(.*:)? +" > /dev/null 2>&1
 }
 
 perf_report_branch_samples() {
@@ -54,8 +54,8 @@ perf_report_branch_samples() {
#   73.04%73.04%  touchlibc-2.27.so  [.] _dl_addr
#7.71% 7.71%  touchlibc-2.27.so  [.] getenv
#2.59% 2.59%  touchld-2.27.so[.] strcmp
-   perf report --stdio -i ${perfdata} | \
-   egrep " +[0-9]+\.[0-9]+% +[0-9]+\.[0-9]+% +$1 "
+   perf report --stdio -i ${perfdata} 2>&1 | \
+   egrep " +[0-9]+\.[0-9]+% +[0-9]+\.[0-9]+% +$1 " > /dev/null 2>&1
 }
 
 perf_report_instruction_samples() {
@@ -65,8 +65,8 @@ perf_report_instruction_samples() {
#   68.12%  touchlibc-2.27.so   [.] _dl_addr
#5.80%  touchlibc-2.27.so   [.] getenv
#4.35%  touchld-2.27.so [.] _dl_fixup
-   perf report --itrace=i1000i --stdio -i ${perfdata} | \
-   egrep " +[0-9]+\.[0-9]+% +$1"
+   perf report --itrace=i1000i --stdio -i ${perfdata} 2>&1 | \
+   egrep " +[0-9]+\.[0-9]+% +$1" > /dev/null 2>&1
 }
 
 is_device_sink() {
@@ -129,9 +129,6 @@ arm_cs_etm_traverse_path_test() {
# Find the ETM device belonging to which CPU
cpu=`cat $dev/cpu`
 
-   echo $dev
-   echo $cpu
-
# Use depth-first search (DFS) to iterate outputs
arm_cs_iterate_devices $dev $cpu
done
@@ -139,7 +136,7 @@ arm_cs_etm_traverse_path_test() {
 
 arm_cs_etm_system_wide_test() {
echo "Recording trace with system wide mode"
-   perf record -o ${perfdata} -e cs_etm// -a -- ls
+   perf record -o ${perfdata} -e cs_etm// -a -- ls > /dev/null 2>&1
 
perf_script_branch_samples perf &&
perf_report_branch_samples perf &&
@@ -154,7 +151,7 @@ arm_cs_etm_system_wide_test() {
 arm_cs_etm_snapshot_test() {
echo "Recording trace with snapshot mode"
perf record -o ${perfdata} -e cs_etm// -S \
-   -- dd if=/dev/zero of=/dev/null &
+   -- dd if=/dev/zero of=/dev/null > /dev/null 2>&1 &
PERFPID=$!
 
# Wait for perf program
-- 
2.25.1



[PATCH v1 0/2] perf test: Output sub testing result in cs-etm

2021-02-15 Thread Leo Yan
inished with 0
   end ----
  Check Arm CoreSight trace data recording and synthesized samples: Ok

[1] https://lkft.linaro.org/


Leo Yan (2):
  perf test: Suppress logs in cs-etm testing
  perf test: Output the sub testing result in cs-etm

 tools/perf/tests/shell/test_arm_coresight.sh | 45 ++--
 1 file changed, 23 insertions(+), 22 deletions(-)

-- 
2.25.1



[PATCH v4 4/5] perf cs-etm: Add helper cs_etm__get_pid_fmt()

2021-02-13 Thread Leo Yan
This patch adds helper function cs_etm__get_pid_fmt(), by passing
parameter "traceID", it returns the PID format.

Signed-off-by: Leo Yan 
Reviewed-by: Mathieu Poirier 
Reviewed-by: Suzuki K Poulose 
---
 tools/perf/util/cs-etm.c | 42 
 tools/perf/util/cs-etm.h |  1 +
 2 files changed, 43 insertions(+)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a2a369e2fbb6..b9c1d329a7f1 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -7,6 +7,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -156,6 +157,47 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu)
return 0;
 }
 
+/*
+ * The returned PID format is presented by two bits:
+ *
+ *   Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced;
+ *   Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced.
+ *
+ * It's possible that the two bits ETM_OPT_CTXTID and ETM_OPT_CTXTID2
+ * are enabled at the same time when the session runs on an EL2 kernel.
+ * This means the CONTEXTIDR_EL1 and CONTEXTIDR_EL2 both will be
+ * recorded in the trace data, the tool will selectively use
+ * CONTEXTIDR_EL2 as PID.
+ */
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
+{
+   struct int_node *inode;
+   u64 *metadata, val;
+
+   inode = intlist__find(traceid_list, trace_chan_id);
+   if (!inode)
+   return -EINVAL;
+
+   metadata = inode->priv;
+
+   if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
+   val = metadata[CS_ETM_ETMCR];
+   /* CONTEXTIDR is traced */
+   if (val & BIT(ETM_OPT_CTXTID))
+   *pid_fmt = BIT(ETM_OPT_CTXTID);
+   } else {
+   val = metadata[CS_ETMV4_TRCCONFIGR];
+   /* CONTEXTIDR_EL2 is traced */
+   if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT)))
+   *pid_fmt = BIT(ETM_OPT_CTXTID2);
+   /* CONTEXTIDR_EL1 is traced */
+   else if (val & BIT(ETM4_CFG_BIT_CTXTID))
+   *pid_fmt = BIT(ETM_OPT_CTXTID);
+   }
+
+   return 0;
+}
+
 void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
  u8 trace_chan_id)
 {
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 4ad925d6d799..7cc3bba0017d 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -173,6 +173,7 @@ struct cs_etm_packet_queue {
 int cs_etm__process_auxtrace_info(union perf_event *event,
  struct perf_session *session);
 int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
 int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
 pid_t tid, u8 trace_chan_id);
 bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
-- 
2.25.1



[PATCH v4 5/5] perf cs-etm: Detect pid in VMID for kernel running at EL2

2021-02-13 Thread Leo Yan
From: Suzuki K Poulose 

The PID of the task could be traced as VMID when the kernel is running
at EL2.  Teach the decoder to look for VMID when the CONTEXTIDR (Arm32)
or CONTEXTIDR_EL1 (Arm64) is invalid but we have a valid VMID.

Cc: Mike Leach 
Cc: Mathieu Poirier 
Cc: Al Grant 
Signed-off-by: Suzuki K Poulose 
Co-developed-by: Leo Yan 
Signed-off-by: Leo Yan 
Reviewed-by: Mathieu Poirier 
---
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 38 +--
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 3f4bc4050477..4052c9ce6e2f 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -6,6 +6,7 @@
  * Author: Mathieu Poirier 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -491,13 +492,42 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
const ocsd_generic_trace_elem *elem,
const uint8_t trace_chan_id)
 {
-   pid_t tid;
+   pid_t tid = -1;
+   static u64 pid_fmt;
+   int ret;
 
-   /* Ignore PE_CONTEXT packets that don't have a valid contextID */
-   if (!elem->context.ctxt_id_valid)
+   /*
+* As all the ETMs run at the same exception level, the system should
+* have the same PID format crossing CPUs.  So cache the PID format
+* and reuse it for sequential decoding.
+*/
+   if (!pid_fmt) {
+   ret = cs_etm__get_pid_fmt(trace_chan_id, _fmt);
+   if (ret)
+   return OCSD_RESP_FATAL_SYS_ERR;
+   }
+
+   /*
+* Process the PE_CONTEXT packets if we have a valid contextID or VMID.
+* If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2
+* as VMID, Bit ETM_OPT_CTXTID2 is set in this case.
+*/
+   switch (pid_fmt) {
+   case BIT(ETM_OPT_CTXTID):
+   if (elem->context.ctxt_id_valid)
+   tid = elem->context.context_id;
+   break;
+   case BIT(ETM_OPT_CTXTID2):
+   if (elem->context.vmid_valid)
+   tid = elem->context.vmid;
+   break;
+   default:
+   break;
+   }
+
+   if (tid == -1)
return OCSD_RESP_CONT;
 
-   tid =  elem->context.context_id;
if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
return OCSD_RESP_FATAL_SYS_ERR;
 
-- 
2.25.1



[PATCH v4 2/5] perf cs-etm: Fix bitmap for option

2021-02-13 Thread Leo Yan
From: Suzuki K Poulose 

When set option with macros ETM_OPT_CTXTID and ETM_OPT_TS, it wrongly
takes these two values (14 and 28 prespectively) as bit masks, but
actually both are the offset for bits.  But this doesn't lead to
further failure due to the AND logic operation will be always true for
ETM_OPT_CTXTID / ETM_OPT_TS.

This patch uses the BIT() macro for option bits, thus it can request the
correct bitmaps for "contextid" and "timestamp" when calling
cs_etm_set_option().

Signed-off-by: Suzuki K Poulose 
[Extract the change as a separate patch for easier review]
Signed-off-by: Leo Yan 
Reviewed-by: Mike Leach 
Reviewed-by: Mathieu Poirier 
---
 tools/perf/arch/arm/util/cs-etm.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/arm/util/cs-etm.c 
b/tools/perf/arch/arm/util/cs-etm.c
index bd446aba64f7..ad8421e8b651 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -169,17 +169,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
!cpu_map__has(online_cpus, i))
continue;
 
-   if (option & ETM_OPT_CTXTID) {
+   if (option & BIT(ETM_OPT_CTXTID)) {
err = cs_etm_set_context_id(itr, evsel, i);
if (err)
goto out;
}
-   if (option & ETM_OPT_TS) {
+   if (option & BIT(ETM_OPT_TS)) {
err = cs_etm_set_timestamp(itr, evsel, i);
if (err)
goto out;
}
-   if (option & ~(ETM_OPT_CTXTID | ETM_OPT_TS))
+   if (option & ~(BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS)))
/* Nothing else is currently supported */
goto out;
}
@@ -406,7 +406,7 @@ static int cs_etm_recording_options(struct auxtrace_record 
*itr,
evsel__set_sample_bit(cs_etm_evsel, CPU);
 
err = cs_etm_set_option(itr, cs_etm_evsel,
-   ETM_OPT_CTXTID | ETM_OPT_TS);
+   BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS));
if (err)
goto out;
}
-- 
2.25.1



[PATCH v4 1/5] tools headers UAPI: Update tools' copy of linux/coresight-pmu.h

2021-02-13 Thread Leo Yan
To get the changes in the commit:

  "coresight: etm-perf: Clarify comment on perf options".

Signed-off-by: Leo Yan 
Reviewed-by: Suzuki K Poulose 
Reviewed-by: Mathieu Poirier 
---
 tools/include/linux/coresight-pmu.h | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/include/linux/coresight-pmu.h 
b/tools/include/linux/coresight-pmu.h
index b0e35eec6499..5dc47cfdcf07 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -10,11 +10,18 @@
 #define CORESIGHT_ETM_PMU_NAME "cs_etm"
 #define CORESIGHT_ETM_PMU_SEED  0x10
 
-/* ETMv3.5/PTM's ETMCR config bit */
-#define ETM_OPT_CYCACC  12
-#define ETM_OPT_CTXTID 14
-#define ETM_OPT_TS  28
-#define ETM_OPT_RETSTK 29
+/*
+ * Below are the definition of bit offsets for perf option, and works as
+ * arbitrary values for all ETM versions.
+ *
+ * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
+ * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
+ * directly use below macros as config bits.
+ */
+#define ETM_OPT_CYCACC 12
+#define ETM_OPT_CTXTID 14
+#define ETM_OPT_TS 28
+#define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
-- 
2.25.1



[PATCH v4 3/5] perf cs-etm: Support PID tracing in config

2021-02-13 Thread Leo Yan
From: Suzuki K Poulose 

If the kernel is running at EL2, the pid of a task is exposed via VMID
instead of the CONTEXTID.  Add support for this in the perf tool.

This patch respects user setting if user has specified any configs
from "contextid", "contextid1" or "contextid2"; otherwise, it
dynamically sets config based on PMU format "contextid".

Cc: Mike Leach 
Cc: Mathieu Poirier 
Cc: Al Grant 
Signed-off-by: Suzuki K Poulose 
Co-developed-by: Leo Yan 
Signed-off-by: Leo Yan 
Reviewed-by: Mike Leach 
Reviewed-by: Mathieu Poirier 
---
 tools/include/linux/coresight-pmu.h |  3 ++
 tools/perf/arch/arm/util/cs-etm.c   | 61 +++--
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/tools/include/linux/coresight-pmu.h 
b/tools/include/linux/coresight-pmu.h
index 5dc47cfdcf07..4ac5c081af93 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -20,14 +20,17 @@
  */
 #define ETM_OPT_CYCACC 12
 #define ETM_OPT_CTXTID 14
+#define ETM_OPT_CTXTID215
 #define ETM_OPT_TS 28
 #define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
 #define ETM4_CFG_BIT_CTXTID6
+#define ETM4_CFG_BIT_VMID  7
 #define ETM4_CFG_BIT_TS11
 #define ETM4_CFG_BIT_RETSTK12
+#define ETM4_CFG_BIT_VMID_OPT  15
 
 static inline int coresight_get_trace_id(int cpu)
 {
diff --git a/tools/perf/arch/arm/util/cs-etm.c 
b/tools/perf/arch/arm/util/cs-etm.c
index ad8421e8b651..08c329d6b2a5 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -67,6 +67,7 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
char path[PATH_MAX];
int err = -EINVAL;
u32 val;
+   u64 contextid;
 
ptr = container_of(itr, struct cs_etm_recording, itr);
cs_etm_pmu = ptr->cs_etm_pmu;
@@ -86,25 +87,59 @@ static int cs_etm_set_context_id(struct auxtrace_record 
*itr,
goto out;
}
 
+   /* User has configured for PID tracing, respects it. */
+   contextid = evsel->core.attr.config &
+   (BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_CTXTID2));
+
/*
-* TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID tracing
-* is supported:
-*  0b0 Context ID tracing is not supported.
-*  0b00100 Maximum of 32-bit Context ID size.
-*  All other values are reserved.
+* If user doesn't configure the contextid format, parse PMU format and
+* enable PID tracing according to the "contextid" format bits:
+*
+*   If bit ETM_OPT_CTXTID is set, trace CONTEXTIDR_EL1;
+*   If bit ETM_OPT_CTXTID2 is set, trace CONTEXTIDR_EL2.
 */
-   val = BMVAL(val, 5, 9);
-   if (!val || val != 0x4) {
-   err = -EINVAL;
-   goto out;
+   if (!contextid)
+   contextid = perf_pmu__format_bits(_etm_pmu->format,
+ "contextid");
+
+   if (contextid & BIT(ETM_OPT_CTXTID)) {
+   /*
+* TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
+* tracing is supported:
+*  0b0 Context ID tracing is not supported.
+*  0b00100 Maximum of 32-bit Context ID size.
+*  All other values are reserved.
+*/
+   val = BMVAL(val, 5, 9);
+   if (!val || val != 0x4) {
+   pr_err("%s: CONTEXTIDR_EL1 isn't supported\n",
+  CORESIGHT_ETM_PMU_NAME);
+   err = -EINVAL;
+   goto out;
+   }
+   }
+
+   if (contextid & BIT(ETM_OPT_CTXTID2)) {
+   /*
+* TRCIDR2.VMIDOPT[30:29] != 0 and
+* TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
+* We can't support CONTEXTIDR in VMID if the size of the
+* virtual context id is < 32bit.
+* Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us.
+*/
+   if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+   pr_err("%s: CONTEXTIDR_EL2 isn't supported\n",
+  CORESIGHT_ETM_PMU_NAME);
+   err = -EINVAL;
+   goto out;
+   }
}
 
/* All good, let the kernel know */
-   evsel->core.attr.config |= (1 << ETM_OPT_CTXTID);
+   evsel->core.attr.config |= contextid;
err = 0;
 
 out:
-
return err;
 }
 
@@ -485,7 +520,9 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
config |= BIT(ETM4_CFG_BIT_TS);
if (config_op

[PATCH v4 0/5] perf cs-etm: Fix pid tracing with VHE

2021-02-13 Thread Leo Yan
This patch set is to follow up the patch series "coresight: etm-perf:
Fix pid tracing with VHE" [1].

Since the kernel and documentation patches have been picked up by
Mathieu Poirier and will go through the char-misc-next branch [2] to the
mainline kernel; the rest patches are for the perf tool, so combine
these patches into this patch set.

The patch set can be cleanly applied on perf/core branch with:

  commit 6db59d357e8e ("perf arm64/s390: Fix printf conversion specifier for IP 
addresses")

And this patch set has been verified on Arm Juno-r2 board.

Changes from v3:
* Added Reviewed-by tags (Mathieu/Mike/Suzuki);
* Changed to use the existed macros for option bits in patch 02/05
  (Mathieu).


[1] https://lore.kernel.org/patchwork/cover/1376776/
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git/log/?h=char-misc-next


Leo Yan (2):
  tools headers UAPI: Update tools' copy of linux/coresight-pmu.h
  perf cs-etm: Add helper cs_etm__get_pid_fmt()

Suzuki K Poulose (3):
  perf cs-etm: Fix bitmap for option
  perf cs-etm: Support PID tracing in config
  perf cs-etm: Detect pid in VMID for kernel running at EL2

 tools/include/linux/coresight-pmu.h   | 20 --
 tools/perf/arch/arm/util/cs-etm.c | 69 ++-
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 38 --
 tools/perf/util/cs-etm.c  | 42 +++
 tools/perf/util/cs-etm.h  |  1 +
 5 files changed, 145 insertions(+), 25 deletions(-)

-- 
2.25.1



Re: [PATCH v2 1/6] perf arm-spe: Enable sample type PERF_SAMPLE_DATA_SRC

2021-02-12 Thread Leo Yan
On Fri, Feb 12, 2021 at 05:43:40PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Thu, Feb 11, 2021 at 03:38:51PM +0200, James Clark escreveu:
> > From: Leo Yan 
> > 
> > This patch is to enable sample type PERF_SAMPLE_DATA_SRC for Arm SPE in
> > the perf data, when output the tracing data, it tells tools that it
> > contains data source in the memory event.
> 
> Thanks, series applied.

Thanks a lot, James and Arnaldo.


Re: [PATCH 8/8] perf arm-spe: Set thread TID

2021-02-10 Thread Leo Yan
Hi James,

On Wed, Feb 10, 2021 at 12:16:58PM +0200, James Clark wrote:
> 
> 
> On 09/02/2021 17:36, James Clark wrote:
> > 
> > 
> > On 04/02/2021 12:27, Leo Yan wrote:
> >> On Mon, Feb 01, 2021 at 07:40:45PM +0200, James Clark wrote:
> >>>
> >>> On 31/01/2021 14:01, Leo Yan wrote:
> >>>> Option 1: by merging patches 07/08 and 08/08, we can firstly support PID
> >>>> tracing for root namespace, and later we can extend to support PID
> >>>> tracing in container (and in VMs).
> >>>>
> >> Arm SPE has the problem for step2, due to the trace uses statistical
> >> approach, it doesn't trace the complete branch instructions, so it
> >> cannot promise to capture all branches for the symbol "__switch_to".
> >> If we only use the events PERF_RECORD_SWITCH /
> >> PERF_RECORD_SWITCH_CPU_WIDE, then it will lead to the coarse result
> >> for PID tracing.
> >>
> >> For this reason, seems to me it's pragmatic to use CONTEXTIDR for
> >> PID tracing at current stage, at least it can allow the root domain
> >> tracing works accurately.  But this will leave the issue for tracing
> >> PID in non root namespace, we need to figure out solution later.
> >>
> >> Hi Mark.R, Al, do you have any comments for this?
> > 
> > Hi Leo,
> > 
> > I spoke with Al and his suggestion is to clear the PID value if the event
> > was opened outside of the root namespace.
> > 
> > I think that's not a bad idea as it gets us PIDs in most cases but also
> > doesn't show any incorrect data. Do you know if it's possible to determine
> > that from a perf.data file? Unfortunately it doesn't seem to be possible
> > to disable CONTEXTIDR tracing when opening the event as it's compile time
> > only and can't be disabled dynamically.
> > 
> > James
> > 
> 
> I've had a think about it and I think we should do one of two things:

Thanks a lot for digging!

> #1) Remove the PID setting from the data source patchset. This will keep the
> existing behaviour of using the PID of the first traced process only even
> if there are forks. Later we can implement #2 or attempt to make it work
> even in non root namespaces.

I agree.  Let's simplify the data source patch set; could you resend the
data source patch set so this can allow perf maintainer to easier follow
up (and merge) the patch series?  Thanks!

> I'm not sure how this will impact your c2c patchset if you are relying on
> the PID data Leo?

Yes, based on the experiment, if we want to extend "perf c2c" for
exhibit multi-threading info, then it depends on PID tracing.

> #2) Make a change in the SPE driver to add an option for disabling CONTEXTIDR.
> We will disable this from userspace if the event is opened in a non root
> namespace. So we will only show PID data if we know it's valid, otherwise
> the existing behaviour of only using the first PID will remain.

Yeah, just a minor difference in my head.

Yes, we can use the kernel to export an extra PMU format, e.g. a new PMU format
"contextid", so the kernel provides a knob for userspace (this is similiar with
perf cs-etm :)).

I am just wandering if we can disable CONTEXTIDR tracing in the kernel side,
e.g. when the kernel detects if it's running on non root namespace, it should
not set bit SYS_PMSCR_EL1_CX_SHIFT; so if the tool in the userspace has
specified the PMU format "contextid" from non root namespace, the kernel should
report failure for without permission.

This seems to me, at least, we can have a sane solution for root
namespace.

Thanks,
Leo


Re: [PATCH] perf tools: Fix arm64 build error with gcc-11

2021-02-09 Thread Leo Yan
On Tue, Feb 09, 2021 at 02:18:26PM +, John Garry wrote:
> On 09/02/2021 12:17, Leo Yan wrote:
> > Hi Jianlin,
> > 
> > On Tue, Feb 09, 2021 at 07:33:57PM +0800, Jianlin Lv wrote:
> > > gcc version: 11.0.0 20210208 (experimental) (GCC)
> > > 
> > > Following build error on arm64:
> > > 
> > > ...
> > > In function ‘printf’,
> > >  inlined from ‘regs_dump__printf’ at util/session.c:1141:3,
> > >  inlined from ‘regs__printf’ at util/session.c:1169:2:
> > > /usr/include/aarch64-linux-gnu/bits/stdio2.h:107:10: \
> > >error: ‘%-5s’ directive argument is null [-Werror=format-overflow=]
> > > 
> > > 107 |   return __printf_chk (__USE_FORTIFY_LEVEL - 1, __fmt, \
> > >  __va_arg_pack ());
> > > 
> > > ..
> > > In function ‘fprintf’,
> > >inlined from ‘perf_sample__fprintf_regs.isra’ at \
> > >  builtin-script.c:622:14:
> > > /usr/include/aarch64-linux-gnu/bits/stdio2.h:100:10: \
> > >   error: ‘%5s’ directive argument is null [-Werror=format-overflow=]
> > >100 |   return __fprintf_chk (__stream, __USE_FORTIFY_LEVEL - 1, __fmt,
> > >101 | __va_arg_pack ());
> > > 
> > > cc1: all warnings being treated as errors
> > > ...
> > > 
> > > This patch fixes Wformat-overflow warnings by replacing the return
> > > value NULL of perf_reg_name with "unknown".
> > > 
> > > Signed-off-by: Jianlin Lv 
> > > ---
> > >   tools/perf/arch/arm64/include/perf_regs.h | 4 ++--
> > >   1 file changed, 2 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/tools/perf/arch/arm64/include/perf_regs.h 
> > > b/tools/perf/arch/arm64/include/perf_regs.h
> > > index baaa5e64a3fb..901419f907c0 100644
> > > --- a/tools/perf/arch/arm64/include/perf_regs.h
> > > +++ b/tools/perf/arch/arm64/include/perf_regs.h
> > > @@ -85,10 +85,10 @@ static inline const char *perf_reg_name(int id)
> > >   case PERF_REG_ARM64_PC:
> > >   return "pc";
> > >   default:
> > > - return NULL;
> > > + return "unknown";
> > >   }
> > > - return NULL;
> > > + return "unknown";
> > 
> > This issue is a common issue crossing all archs.  So it's better to
> > change the code in the places where calls perf_reg_name(), e.g. in
> > util/session.c:
> > 
> > --- a/tools/perf/util/session.c
> > +++ b/tools/perf/util/session.c
> > @@ -1135,12 +1135,14 @@ static void branch_stack__printf(struct perf_sample 
> > *sample, bool callstack)
> >   static void regs_dump__printf(u64 mask, u64 *regs)
> >   {
> >  unsigned rid, i = 0;
> > +   char *reg_name;
> >  for_each_set_bit(rid, (unsigned long *) , sizeof(mask) * 8) {
> >  u64 val = regs[i++];
> > +   reg_name = perf_reg_name(rid);
> >  printf(" %-5s 0x%016" PRIx64 "\n",
> > -  perf_reg_name(rid), val);
> > +  reg_name ?: "Unknown", val);
> >  }
> >   }
> > 
> > And another potential issue is the format specifier "%-5s", it prints
> > out maximum to 5 chars,
> 
> Doesn't the width field specify the min, not max, number of characters?

Thanks for correction, John.

I wrongly understood it and sorry for confusion.  Wiki says [1]:

"The Width field specifies a minimum number of characters to output,
and is typically used to pad fixed-width fields in tabulated output,
where the fields would otherwise be smaller, although it does not
cause truncation of oversized fields."

Thanks,
Leo

[1] https://en.wikipedia.org/wiki/Printf_format_string#Width_field


Re: [PATCH] perf tools: Fix arm64 build error with gcc-11

2021-02-09 Thread Leo Yan
Hi Jianlin,

On Tue, Feb 09, 2021 at 07:33:57PM +0800, Jianlin Lv wrote:
> gcc version: 11.0.0 20210208 (experimental) (GCC)
> 
> Following build error on arm64:
> 
> ...
> In function ‘printf’,
> inlined from ‘regs_dump__printf’ at util/session.c:1141:3,
> inlined from ‘regs__printf’ at util/session.c:1169:2:
> /usr/include/aarch64-linux-gnu/bits/stdio2.h:107:10: \
>   error: ‘%-5s’ directive argument is null [-Werror=format-overflow=]
> 
> 107 |   return __printf_chk (__USE_FORTIFY_LEVEL - 1, __fmt, \
> __va_arg_pack ());
> 
> ..
> In function ‘fprintf’,
>   inlined from ‘perf_sample__fprintf_regs.isra’ at \
> builtin-script.c:622:14:
> /usr/include/aarch64-linux-gnu/bits/stdio2.h:100:10: \
>   error: ‘%5s’ directive argument is null [-Werror=format-overflow=]
>   100 |   return __fprintf_chk (__stream, __USE_FORTIFY_LEVEL - 1, __fmt,
>   101 | __va_arg_pack ());
> 
> cc1: all warnings being treated as errors
> ...
> 
> This patch fixes Wformat-overflow warnings by replacing the return
> value NULL of perf_reg_name with "unknown".
> 
> Signed-off-by: Jianlin Lv 
> ---
>  tools/perf/arch/arm64/include/perf_regs.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/perf/arch/arm64/include/perf_regs.h 
> b/tools/perf/arch/arm64/include/perf_regs.h
> index baaa5e64a3fb..901419f907c0 100644
> --- a/tools/perf/arch/arm64/include/perf_regs.h
> +++ b/tools/perf/arch/arm64/include/perf_regs.h
> @@ -85,10 +85,10 @@ static inline const char *perf_reg_name(int id)
>   case PERF_REG_ARM64_PC:
>   return "pc";
>   default:
> - return NULL;
> + return "unknown";
>   }
>  
> - return NULL;
> + return "unknown";

This issue is a common issue crossing all archs.  So it's better to
change the code in the places where calls perf_reg_name(), e.g. in
util/session.c:

--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1135,12 +1135,14 @@ static void branch_stack__printf(struct perf_sample 
*sample, bool callstack)
 static void regs_dump__printf(u64 mask, u64 *regs)
 {
unsigned rid, i = 0;
+   char *reg_name;
 
for_each_set_bit(rid, (unsigned long *) , sizeof(mask) * 8) {
u64 val = regs[i++];
 
+   reg_name = perf_reg_name(rid);
printf(" %-5s 0x%016" PRIx64 "\n",
-  perf_reg_name(rid), val);
+  reg_name ?: "Unknown", val);
}
 }

And another potential issue is the format specifier "%-5s", it prints
out maximum to 5 chars, but actually string "Unknown" has 7 chars.
Actually the format specifier breaks other archs register names, e.g.
[1][2], seems to me, it's better to change as "%-8s", you might need
to use a dedicated patch for format specifier changes.

Thanks,
Leo


[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/arch/powerpc/include/perf_regs.h#n57
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/arch/csky/include/perf_regs.h#n83


Re: [PATCH v3 4/8] perf cs-etm: Fix bitmap for option

2021-02-08 Thread Leo Yan
On Mon, Feb 08, 2021 at 01:46:41PM -0700, Mathieu Poirier wrote:
> On Sat, Feb 06, 2021 at 11:08:29PM +0800, Leo Yan wrote:
> > From: Suzuki K Poulose 
> > 
> > When set option with macros ETM_OPT_CTXTID and ETM_OPT_TS, it wrongly
> > takes these two values (14 and 28 prespectively) as bit masks, but
> > actually both are the offset for bits.  But this doesn't lead to
> > further failure due to the AND logic operation will be always true for
> > ETM_OPT_CTXTID / ETM_OPT_TS.
> > 
> > This patch defines new independent macros (rather than using the
> > "config" bits) for requesting the "contextid" and "timestamp" for
> > cs_etm_set_option().
> > 
> > [leoy: Extract the change as a separate patch for easier review]
> 
> This should go just above your name - see below.
> 
> > Signed-off-by: Suzuki K Poulose 
> > Signed-off-by: Leo Yan 
> > Reviewed-by: Mike Leach 
> 
>  Signed-off-by: Suzuki K Poulose 
>  [Extract the change as a separate patch for easier review]
>  Signed-off-by: Leo Yan 
>  Reviewed-by: Mike Leach 
> 
> > ---
> >  tools/perf/arch/arm/util/cs-etm.c | 12 
> >  1 file changed, 8 insertions(+), 4 deletions(-)
> > 
> > diff --git a/tools/perf/arch/arm/util/cs-etm.c 
> > b/tools/perf/arch/arm/util/cs-etm.c
> > index bd446aba64f7..c25c878fd06c 100644
> > --- a/tools/perf/arch/arm/util/cs-etm.c
> > +++ b/tools/perf/arch/arm/util/cs-etm.c
> > @@ -156,6 +156,10 @@ static int cs_etm_set_timestamp(struct auxtrace_record 
> > *itr,
> > return err;
> >  }
> >  
> > +#define ETM_SET_OPT_CTXTID (1 << 0)
> > +#define ETM_SET_OPT_TS (1 << 1)
> > +#define ETM_SET_OPT_MASK   (ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS)
> > +
> 
> I would much rather see this fixed with the BIT() macro as it is done in the
> rest of this set than defining new constant.
> 
> With the above:
> 
> Reviewed-by: Mathieu Poirier 
> 
> I have picked up the kernel portion of this set.  I suggest you fix the above
> and send another revision to Arnaldo with my RBs.

Will do this.  Thanks for suggestion, Mathieu.

Leo

[...]


[PATCH v3 8/8] Documentation: coresight: Add PID tracing description

2021-02-06 Thread Leo Yan
After support the PID tracing for the kernel in EL1 or EL2, the usage
gets more complicated.

This patch gives description for the PMU formats of contextID configs,
this can help users to understand how to control the knobs for PID
tracing when the kernel is in different ELs.

Signed-off-by: Leo Yan 
---
 Documentation/trace/coresight/coresight.rst | 32 +
 1 file changed, 32 insertions(+)

diff --git a/Documentation/trace/coresight/coresight.rst 
b/Documentation/trace/coresight/coresight.rst
index 0b73acb44efa..169749efd8d1 100644
--- a/Documentation/trace/coresight/coresight.rst
+++ b/Documentation/trace/coresight/coresight.rst
@@ -512,6 +512,38 @@ The --itrace option controls the type and frequency of 
synthesized events
 Note that only 64-bit programs are currently supported - further work is
 required to support instruction decode of 32-bit Arm programs.
 
+2.2) Tracing PID
+
+The kernel can be built to write the PID value into the PE ContextID registers.
+For a kernel running at EL1, the PID is stored in CONTEXTIDR_EL1.  A PE may
+implement Arm Virtualization Host Extensions (VHE), which the kernel can
+run at EL2 as a virtualisation host; in this case, the PID value is stored in
+CONTEXTIDR_EL2.
+
+perf provides PMU formats that program the ETM to insert these values into the
+trace data; the PMU formats are defined as below:
+
+  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
+kernel is running at EL1, "contextid1" enables the PID
+tracing; when the kernel is running at EL2, this enables
+tracing the PID of guest applications.
+
+  "contextid2": Only usable when the kernel is running at EL2.  When
+selected, enables PID tracing on EL2 kernel.
+
+  "contextid":  Will be an alias for the option that enables PID
+tracing.  I.e,
+contextid == contextid1, on EL1 kernel.
+contextid == contextid2, on EL2 kernel.
+
+perf will always enable PID tracing at the relevant EL, this is accomplished by
+automatically enable the "contextid" config - but for EL2 it is possible to 
make
+specific adjustments using configs "contextid1" and "contextid2", E.g. if a 
user
+wants to trace PIDs for both host and guest, the two configs "contextid1" and
+"contextid2" can be set at the same time:
+
+  perf record -e cs_etm/contextid1,contextid2/u -- vm
+
 
 Generating coverage files for Feedback Directed Optimization: AutoFDO
 -
-- 
2.25.1



[PATCH v3 6/8] perf cs-etm: Add helper cs_etm__get_pid_fmt()

2021-02-06 Thread Leo Yan
This patch adds helper function cs_etm__get_pid_fmt(), by passing
parameter "traceID", it returns the PID format.

Signed-off-by: Leo Yan 
---
 tools/perf/util/cs-etm.c | 42 
 tools/perf/util/cs-etm.h |  1 +
 2 files changed, 43 insertions(+)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a2a369e2fbb6..b9c1d329a7f1 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -7,6 +7,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -156,6 +157,47 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu)
return 0;
 }
 
+/*
+ * The returned PID format is presented by two bits:
+ *
+ *   Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced;
+ *   Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced.
+ *
+ * It's possible that the two bits ETM_OPT_CTXTID and ETM_OPT_CTXTID2
+ * are enabled at the same time when the session runs on an EL2 kernel.
+ * This means the CONTEXTIDR_EL1 and CONTEXTIDR_EL2 both will be
+ * recorded in the trace data, the tool will selectively use
+ * CONTEXTIDR_EL2 as PID.
+ */
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
+{
+   struct int_node *inode;
+   u64 *metadata, val;
+
+   inode = intlist__find(traceid_list, trace_chan_id);
+   if (!inode)
+   return -EINVAL;
+
+   metadata = inode->priv;
+
+   if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
+   val = metadata[CS_ETM_ETMCR];
+   /* CONTEXTIDR is traced */
+   if (val & BIT(ETM_OPT_CTXTID))
+   *pid_fmt = BIT(ETM_OPT_CTXTID);
+   } else {
+   val = metadata[CS_ETMV4_TRCCONFIGR];
+   /* CONTEXTIDR_EL2 is traced */
+   if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT)))
+   *pid_fmt = BIT(ETM_OPT_CTXTID2);
+   /* CONTEXTIDR_EL1 is traced */
+   else if (val & BIT(ETM4_CFG_BIT_CTXTID))
+   *pid_fmt = BIT(ETM_OPT_CTXTID);
+   }
+
+   return 0;
+}
+
 void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
  u8 trace_chan_id)
 {
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 4ad925d6d799..7cc3bba0017d 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -173,6 +173,7 @@ struct cs_etm_packet_queue {
 int cs_etm__process_auxtrace_info(union perf_event *event,
  struct perf_session *session);
 int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
 int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
 pid_t tid, u8 trace_chan_id);
 bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
-- 
2.25.1



[PATCH v3 7/8] perf cs-etm: Detect pid in VMID for kernel running at EL2

2021-02-06 Thread Leo Yan
From: Suzuki K Poulose 

The PID of the task could be traced as VMID when the kernel is running
at EL2.  Teach the decoder to look for VMID when the CONTEXTIDR (Arm32)
or CONTEXTIDR_EL1 (Arm64) is invalid but we have a valid VMID.

Cc: Mike Leach 
Cc: Mathieu Poirier 
Cc: Al Grant 
Signed-off-by: Suzuki K Poulose 
Co-developed-by: Leo Yan 
Signed-off-by: Leo Yan 
---
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 38 +--
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 3f4bc4050477..4052c9ce6e2f 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -6,6 +6,7 @@
  * Author: Mathieu Poirier 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -491,13 +492,42 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
const ocsd_generic_trace_elem *elem,
const uint8_t trace_chan_id)
 {
-   pid_t tid;
+   pid_t tid = -1;
+   static u64 pid_fmt;
+   int ret;
 
-   /* Ignore PE_CONTEXT packets that don't have a valid contextID */
-   if (!elem->context.ctxt_id_valid)
+   /*
+* As all the ETMs run at the same exception level, the system should
+* have the same PID format crossing CPUs.  So cache the PID format
+* and reuse it for sequential decoding.
+*/
+   if (!pid_fmt) {
+   ret = cs_etm__get_pid_fmt(trace_chan_id, _fmt);
+   if (ret)
+   return OCSD_RESP_FATAL_SYS_ERR;
+   }
+
+   /*
+* Process the PE_CONTEXT packets if we have a valid contextID or VMID.
+* If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2
+* as VMID, Bit ETM_OPT_CTXTID2 is set in this case.
+*/
+   switch (pid_fmt) {
+   case BIT(ETM_OPT_CTXTID):
+   if (elem->context.ctxt_id_valid)
+   tid = elem->context.context_id;
+   break;
+   case BIT(ETM_OPT_CTXTID2):
+   if (elem->context.vmid_valid)
+   tid = elem->context.vmid;
+   break;
+   default:
+   break;
+   }
+
+   if (tid == -1)
return OCSD_RESP_CONT;
 
-   tid =  elem->context.context_id;
if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
return OCSD_RESP_FATAL_SYS_ERR;
 
-- 
2.25.1



[PATCH v3 5/8] perf cs-etm: Support PID tracing in config

2021-02-06 Thread Leo Yan
From: Suzuki K Poulose 

If the kernel is running at EL2, the pid of a task is exposed via VMID
instead of the CONTEXTID.  Add support for this in the perf tool.

This patch respects user setting if user has specified any configs
from "contextid", "contextid1" or "contextid2"; otherwise, it
dynamically sets config based on PMU format "contextid".

Cc: Mike Leach 
Cc: Mathieu Poirier 
Cc: Al Grant 
Signed-off-by: Suzuki K Poulose 
Co-developed-by: Leo Yan 
Signed-off-by: Leo Yan 
Reviewed-by: Mike Leach 
---
 tools/include/linux/coresight-pmu.h |  3 ++
 tools/perf/arch/arm/util/cs-etm.c   | 61 +++--
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/tools/include/linux/coresight-pmu.h 
b/tools/include/linux/coresight-pmu.h
index 5dc47cfdcf07..4ac5c081af93 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -20,14 +20,17 @@
  */
 #define ETM_OPT_CYCACC 12
 #define ETM_OPT_CTXTID 14
+#define ETM_OPT_CTXTID215
 #define ETM_OPT_TS 28
 #define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
 #define ETM4_CFG_BIT_CTXTID6
+#define ETM4_CFG_BIT_VMID  7
 #define ETM4_CFG_BIT_TS11
 #define ETM4_CFG_BIT_RETSTK12
+#define ETM4_CFG_BIT_VMID_OPT  15
 
 static inline int coresight_get_trace_id(int cpu)
 {
diff --git a/tools/perf/arch/arm/util/cs-etm.c 
b/tools/perf/arch/arm/util/cs-etm.c
index c25c878fd06c..fa6f91a7c8a1 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -67,6 +67,7 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
char path[PATH_MAX];
int err = -EINVAL;
u32 val;
+   u64 contextid;
 
ptr = container_of(itr, struct cs_etm_recording, itr);
cs_etm_pmu = ptr->cs_etm_pmu;
@@ -86,25 +87,59 @@ static int cs_etm_set_context_id(struct auxtrace_record 
*itr,
goto out;
}
 
+   /* User has configured for PID tracing, respects it. */
+   contextid = evsel->core.attr.config &
+   (BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_CTXTID2));
+
/*
-* TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID tracing
-* is supported:
-*  0b0 Context ID tracing is not supported.
-*  0b00100 Maximum of 32-bit Context ID size.
-*  All other values are reserved.
+* If user doesn't configure the contextid format, parse PMU format and
+* enable PID tracing according to the "contextid" format bits:
+*
+*   If bit ETM_OPT_CTXTID is set, trace CONTEXTIDR_EL1;
+*   If bit ETM_OPT_CTXTID2 is set, trace CONTEXTIDR_EL2.
 */
-   val = BMVAL(val, 5, 9);
-   if (!val || val != 0x4) {
-   err = -EINVAL;
-   goto out;
+   if (!contextid)
+   contextid = perf_pmu__format_bits(_etm_pmu->format,
+ "contextid");
+
+   if (contextid & BIT(ETM_OPT_CTXTID)) {
+   /*
+* TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
+* tracing is supported:
+*  0b0 Context ID tracing is not supported.
+*  0b00100 Maximum of 32-bit Context ID size.
+*  All other values are reserved.
+*/
+   val = BMVAL(val, 5, 9);
+   if (!val || val != 0x4) {
+   pr_err("%s: CONTEXTIDR_EL1 isn't supported\n",
+  CORESIGHT_ETM_PMU_NAME);
+   err = -EINVAL;
+   goto out;
+   }
+   }
+
+   if (contextid & BIT(ETM_OPT_CTXTID2)) {
+   /*
+* TRCIDR2.VMIDOPT[30:29] != 0 and
+* TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
+* We can't support CONTEXTIDR in VMID if the size of the
+* virtual context id is < 32bit.
+* Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us.
+*/
+   if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+   pr_err("%s: CONTEXTIDR_EL2 isn't supported\n",
+  CORESIGHT_ETM_PMU_NAME);
+   err = -EINVAL;
+   goto out;
+   }
}
 
/* All good, let the kernel know */
-   evsel->core.attr.config |= (1 << ETM_OPT_CTXTID);
+   evsel->core.attr.config |= contextid;
err = 0;
 
 out:
-
return err;
 }
 
@@ -489,7 +524,9 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
config |= BIT(ETM4_CFG_BIT_TS);
if (config_opts & BIT(ETM_OPT_RETSTK))
  

[PATCH v3 4/8] perf cs-etm: Fix bitmap for option

2021-02-06 Thread Leo Yan
From: Suzuki K Poulose 

When set option with macros ETM_OPT_CTXTID and ETM_OPT_TS, it wrongly
takes these two values (14 and 28 prespectively) as bit masks, but
actually both are the offset for bits.  But this doesn't lead to
further failure due to the AND logic operation will be always true for
ETM_OPT_CTXTID / ETM_OPT_TS.

This patch defines new independent macros (rather than using the
"config" bits) for requesting the "contextid" and "timestamp" for
cs_etm_set_option().

[leoy: Extract the change as a separate patch for easier review]
Signed-off-by: Suzuki K Poulose 
Signed-off-by: Leo Yan 
Reviewed-by: Mike Leach 
---
 tools/perf/arch/arm/util/cs-etm.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/arm/util/cs-etm.c 
b/tools/perf/arch/arm/util/cs-etm.c
index bd446aba64f7..c25c878fd06c 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -156,6 +156,10 @@ static int cs_etm_set_timestamp(struct auxtrace_record 
*itr,
return err;
 }
 
+#define ETM_SET_OPT_CTXTID (1 << 0)
+#define ETM_SET_OPT_TS (1 << 1)
+#define ETM_SET_OPT_MASK   (ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS)
+
 static int cs_etm_set_option(struct auxtrace_record *itr,
 struct evsel *evsel, u32 option)
 {
@@ -169,17 +173,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
!cpu_map__has(online_cpus, i))
continue;
 
-   if (option & ETM_OPT_CTXTID) {
+   if (option & ETM_SET_OPT_CTXTID) {
err = cs_etm_set_context_id(itr, evsel, i);
if (err)
goto out;
}
-   if (option & ETM_OPT_TS) {
+   if (option & ETM_SET_OPT_TS) {
err = cs_etm_set_timestamp(itr, evsel, i);
if (err)
goto out;
}
-   if (option & ~(ETM_OPT_CTXTID | ETM_OPT_TS))
+   if (option & ~(ETM_SET_OPT_MASK))
/* Nothing else is currently supported */
goto out;
}
@@ -406,7 +410,7 @@ static int cs_etm_recording_options(struct auxtrace_record 
*itr,
evsel__set_sample_bit(cs_etm_evsel, CPU);
 
err = cs_etm_set_option(itr, cs_etm_evsel,
-   ETM_OPT_CTXTID | ETM_OPT_TS);
+   ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS);
if (err)
goto out;
}
-- 
2.25.1



[PATCH v3 3/8] coresight: etm-perf: Support PID tracing for kernel at EL2

2021-02-06 Thread Leo Yan
From: Suzuki K Poulose 

When the kernel is running at EL2, the PID is stored in CONTEXTIDR_EL2.
So, tracing CONTEXTIDR_EL1 doesn't give us the pid of the process.
Thus we should trace the VMID with VMIDOPT set to trace CONTEXTIDR_EL2
instead of CONTEXTIDR_EL1.  Given that we have an existing config
option "contextid" and this will be useful for tracing virtual machines
(when we get to support virtualization).

So instead, this patch extends option CTXTID with an extra bit
ETM_OPT_CTXTID2 (bit 15), thus on an EL2 kernel, we will have another
bit available for the perf tool: ETM_OPT_CTXTID is for kernel running in
EL1, ETM_OPT_CTXTID2 is used when kernel runs in EL2 with VHE enabled.

The tool must be backward compatible for users, i.e, "contextid" today
traces PID and that should remain the same; for this purpose, the perf
tool is updated to automatically set corresponding bit for the
"contextid" config, therefore, the user doesn't have to bother which EL
the kernel is running.

  i.e, perf record -e cs_etm/contextid/u --

will always do the "pid" tracing, independent of the kernel EL.

The driver parses the format "contextid", which traces CONTEXTIDR_EL1
for ETM_OPT_CTXTID (on EL1 kernel) and traces CONTEXTIDR_EL2 for
ETM_OPT_CTXTID2 (on EL2 kernel).

Besides the enhancement for format "contexid", extra two formats are
introduced: "contextid1" and "contextid2".  This considers to support
tracing both CONTEXTIDR_EL1 and CONTEXTIDR_EL2 when the kernel is
running at EL2.  Finally, the PMU formats are defined as follow:

  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
kernel is running at EL1, "contextid1" enables the PID
tracing; when the kernel is running at EL2, this enables
tracing the PID of guest applications.

  "contextid2": Only usable when the kernel is running at EL2.  When
selected, enables PID tracing on EL2 kernel.

  "contextid":  Will be an alias for the option that enables PID
tracing.  I.e,
contextid == contextid1, on EL1 kernel.
contextid == contextid2, on EL2 kernel.

Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Mike Leach 
Cc: Leo Yan 
Signed-off-by: Suzuki K Poulose 
[ Added two config formats: contextid1, contextid2 ]
Signed-off-by: Leo Yan 
Reviewed-by: Mike Leach 
---
 .../hwtracing/coresight/coresight-etm-perf.c  | 27 ++-
 .../coresight/coresight-etm4x-core.c  | 13 +
 include/linux/coresight-pmu.h |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 465ef1aa8c82..0f603b4094f2 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -32,15 +32,40 @@ static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
  * now take them as general formats and apply on all ETMs.
  */
 PMU_FORMAT_ATTR(cycacc,"config:" __stringify(ETM_OPT_CYCACC));
-PMU_FORMAT_ATTR(contextid, "config:" __stringify(ETM_OPT_CTXTID));
+/* contextid1 enables tracing CONTEXTIDR_EL1 for ETMv4 */
+PMU_FORMAT_ATTR(contextid1,"config:" __stringify(ETM_OPT_CTXTID));
+/* contextid2 enables tracing CONTEXTIDR_EL2 for ETMv4 */
+PMU_FORMAT_ATTR(contextid2,"config:" __stringify(ETM_OPT_CTXTID2));
 PMU_FORMAT_ATTR(timestamp, "config:" __stringify(ETM_OPT_TS));
 PMU_FORMAT_ATTR(retstack,  "config:" __stringify(ETM_OPT_RETSTK));
 /* Sink ID - same for all ETMs */
 PMU_FORMAT_ATTR(sinkid,"config2:0-31");
 
+/*
+ * contextid always traces the "PID".  The PID is in CONTEXTIDR_EL1
+ * when the kernel is running at EL1; when the kernel is at EL2,
+ * the PID is in CONTEXTIDR_EL2.
+ */
+static ssize_t format_attr_contextid_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+   int pid_fmt = ETM_OPT_CTXTID;
+
+#if defined(CONFIG_CORESIGHT_SOURCE_ETM4X)
+   pid_fmt = is_kernel_in_hyp_mode() ? ETM_OPT_CTXTID2 : ETM_OPT_CTXTID;
+#endif
+   return sprintf(page, "config:%d\n", pid_fmt);
+}
+
+struct device_attribute format_attr_contextid =
+   __ATTR(contextid, 0444, format_attr_contextid_show, NULL);
+
 static struct attribute *etm_config_formats_attr[] = {
_attr_cycacc.attr,
_attr_contextid.attr,
+   _attr_contextid1.attr,
+   _attr_contextid2.attr,
_attr_timestamp.attr,
_attr_retstack.attr,
_attr_sinkid.attr,
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c 
b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index b20b6ff17cf6..8681c225b0

[PATCH v3 1/8] coresight: etm-perf: Clarify comment on perf options

2021-02-06 Thread Leo Yan
In theory, the options should be arbitrary values and are neutral for
any ETM version; so far perf tool uses ETMv3.5/PTM ETMCR config bits
except for register's bit definitions, also uses as options.

This can introduce confusion, especially if we want to add a new option
but the new option is not supported by ETMv3.5/PTM ETMCR.  But on the
other hand, we cannot change options since these options are generic
CoreSight PMU ABI.

For easier maintenance and avoid confusion, this patch refines the
comment to clarify perf options, and gives out the background info for
these bits are coming from ETMv3.5/PTM.  Afterwards, we should take
these options as general knobs, and if there have any confliction with
ETMv3.5/PTM, should consider to define saperate macros for ETMv3.5/PTM
ETMCR config bits.

Suggested-by: Suzuki K Poulose 
Signed-off-by: Leo Yan 
Reviewed-by: Suzuki K Poulose 
---
 .../hwtracing/coresight/coresight-etm-perf.c|  5 -
 include/linux/coresight-pmu.h   | 17 -
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index bdc34ca449f7..465ef1aa8c82 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -27,7 +27,10 @@ static bool etm_perf_up;
 static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
 static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
-/* ETMv3.5/PTM's ETMCR is 'config' */
+/*
+ * The PMU formats were orignally for ETMv3.5/PTM's ETMCR 'config';
+ * now take them as general formats and apply on all ETMs.
+ */
 PMU_FORMAT_ATTR(cycacc,"config:" __stringify(ETM_OPT_CYCACC));
 PMU_FORMAT_ATTR(contextid, "config:" __stringify(ETM_OPT_CTXTID));
 PMU_FORMAT_ATTR(timestamp, "config:" __stringify(ETM_OPT_TS));
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index b0e35eec6499..5dc47cfdcf07 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -10,11 +10,18 @@
 #define CORESIGHT_ETM_PMU_NAME "cs_etm"
 #define CORESIGHT_ETM_PMU_SEED  0x10
 
-/* ETMv3.5/PTM's ETMCR config bit */
-#define ETM_OPT_CYCACC  12
-#define ETM_OPT_CTXTID 14
-#define ETM_OPT_TS  28
-#define ETM_OPT_RETSTK 29
+/*
+ * Below are the definition of bit offsets for perf option, and works as
+ * arbitrary values for all ETM versions.
+ *
+ * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
+ * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
+ * directly use below macros as config bits.
+ */
+#define ETM_OPT_CYCACC 12
+#define ETM_OPT_CTXTID 14
+#define ETM_OPT_TS 28
+#define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
-- 
2.25.1



[PATCH v3 2/8] tools headers UAPI: Update tools' copy of linux/coresight-pmu.h

2021-02-06 Thread Leo Yan
To get the changes in the commit:

  "coresight: etm-perf: Clarify comment on perf options".

Signed-off-by: Leo Yan 
Reviewed-by: Suzuki K Poulose 
---
 tools/include/linux/coresight-pmu.h | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/include/linux/coresight-pmu.h 
b/tools/include/linux/coresight-pmu.h
index b0e35eec6499..5dc47cfdcf07 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -10,11 +10,18 @@
 #define CORESIGHT_ETM_PMU_NAME "cs_etm"
 #define CORESIGHT_ETM_PMU_SEED  0x10
 
-/* ETMv3.5/PTM's ETMCR config bit */
-#define ETM_OPT_CYCACC  12
-#define ETM_OPT_CTXTID 14
-#define ETM_OPT_TS  28
-#define ETM_OPT_RETSTK 29
+/*
+ * Below are the definition of bit offsets for perf option, and works as
+ * arbitrary values for all ETM versions.
+ *
+ * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
+ * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
+ * directly use below macros as config bits.
+ */
+#define ETM_OPT_CYCACC 12
+#define ETM_OPT_CTXTID 14
+#define ETM_OPT_TS 28
+#define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
-- 
2.25.1



[PATCH v3 0/8] coresight: etm-perf: Fix pid tracing with VHE

2021-02-06 Thread Leo Yan
This patch series is to support PID tracing with Virtualization Host
Extensions (VHE).

To be backward compatibility, and can both support PID tracing for the
kernel is running at either EL1 or EL2, the two new PMU formats
"contextid1" and "contextid2" are introduced, which works as switches
to trace PID for EL1 kernel and EL2 kernel respectively.

The existed PMU format "contextid" needs to be backward compatible for
users, it's changed as an alias for "contextid1" on EL1 kernel and for
"contextid2" on EL2 kernel.  Therefore, even without setting "contextid"
config, the perf tool can dynamically pick up the config for PID
tracing, the user doesn't have to set the "contexid" config manually.

This patch series can be cleanly applied on perf/core branch:

  commit cd07e536b020 ("Merge remote-tracking branch 'torvalds/master' into 
perf/core")

... and applied on the mainline kernel:

  commit 1e0d27fce010 ("Merge branch 'akpm' (patches from Andrew)")

The patch series has been tested on Arm Juno-r2 board.  Verified the
kernel with EL1 and didn't find issue; after some hacking in kernel
driver and tool to emulate the code paths for kernel on EL2, can see
the code path is hit without failure.


Changes from v2:
* Split into two patches for clarification comment on perf options, one
  patch is for kernel change and the another one is for tools' change
  (Suzuki);
* Simplified cs_etm__get_pid_fmt() to return ETM_OPT_CTXTID2 OR
  ETM_OPT_CTXTID, but not both (Suzuki);
* Cached "pid_fmt" in cs_etm_decoder__set_tid() (Suzuki);
* Refined documentation for more clear description for PMU format usages
  (Mike);
* Added Suzuki's and Mike's Reviewed tags.

Changes from v1:
* Refactored PMU formats, added formats "contextid1"/"contextid2", and
  reworked format "contextid" (Suzuki/Mathieu);
* Refined the comments for perf configs (Leo/Mike);
* Added patch 07/07 for description PID tracing in docs;
* Found the issue for bitmap for option, extracted patch 03/07 for the
  fixing.

Changes from RFC:
* Added comments to clarify cases requested (Leo);
* Explain the change to generic flags for cs_etm_set_option() in the
  commit description;
* Stored PID format in metadata and passed it to decoder (Leo);
* Enhanced cs-etm for backward compatibility (Denis Nikitin).


Leo Yan (4):
  coresight: etm-perf: Clarify comment on perf options
  tools headers UAPI: Update tools' copy of linux/coresight-pmu.h
  perf cs-etm: Add helper cs_etm__get_pid_fmt()
  Documentation: coresight: Add PID tracing description

Suzuki K Poulose (4):
  coresight: etm-perf: Support PID tracing for kernel at EL2
  perf cs-etm: Fix bitmap for option
  perf cs-etm: Support PID tracing in config
  perf cs-etm: Detect pid in VMID for kernel running at EL2

 Documentation/trace/coresight/coresight.rst   | 32 
 .../hwtracing/coresight/coresight-etm-perf.c  | 32 +++-
 .../coresight/coresight-etm4x-core.c  | 13 
 include/linux/coresight-pmu.h | 20 +++--
 tools/include/linux/coresight-pmu.h   | 20 +++--
 tools/perf/arch/arm/util/cs-etm.c | 73 +++
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 38 +-
 tools/perf/util/cs-etm.c  | 42 +++
 tools/perf/util/cs-etm.h  |  1 +
 9 files changed, 239 insertions(+), 32 deletions(-)

-- 
2.25.1



Re: [PATCH v2 7/7] Documentation: coresight: Add PID tracing description

2021-02-04 Thread Leo Yan
On Thu, Feb 04, 2021 at 12:14:12PM +, Mike Leach wrote:

[...]

> > >>> +To support tracing PID for the kernel runs at different exception 
> > >>> levels,
> > >>> +the PMU formats are defined as follow:
> > >>> +
> > >>> +  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
> > >>> +kernel is running at EL1, "contextid1" enables the PID
> > >>> +tracing; when the kernel is running at EL2, this 
> > >>> enables
> > >>> +tracing the PID of guest applications.
> > >>> +
> > >>> +  "contextid2": Only usable when the kernel is running at EL2.  When
> > >>> +selected, enables PID tracing on EL2 kernel.
> > >>> +
> > >>> +  "contextid":  Will be an alias for the option that enables PID
> > >>> +tracing.  I.e,
> > >>> +contextid == contextid1, on EL1 kernel.
> > >>> +contextid == contextid2, on EL2 kernel.
> > >>> +
> > >>> +The perf tool automatically sets corresponding bit for the "contextid" 
> > >>> config,
> > >>> +therefore, the user doesn't have to bother which EL the kernel is 
> > >>> running.
> > >>> +
> > >>> +  i.e, perf record -e cs_etm/contextid/u -- uname
> > >>> +or perf record -e cs_etm//u -- uname
> > >>> +
> > >>> +will always do the "PID" tracing, independent of the kernel EL.
> > >>> +
> > >>
> > >> This is telling me that both cs_etm// and cs_etm/contextid/ have the
> > >> same effect - trace PID. Is this correct?
> > >
> >
> > Just to make this clear, this is not a side effect of the patch.
> 
> Which is fine - but the documentation should accurately reflect what
> is happening on the system.
> This is a new paragraph about the PID tracing or otherwise, Even if
> some of the effects pre-date this patch, they have to be accurately
> communicated.
> I am also reading the new paragraph in the context of the rest of the
> coresight.rst document - which is a user level document explaining the
> basic operation of the coresight system and tools.
> This document mentions no other perf command line parameters relevant
> to coresight other than the @sink option.It actually calls out to the
> OpenCSD docs to provide further information.
> 
> > The perf
> > tool driver automatically adds the "contextid" tracing and timestamp for
> > "system wide" and process bound events, as they traces get mixed into
> > the single sink. So these options are added implicitly by the perf tool
> > to make the decoding easier.
> >
> 
> That's fine - I have no problem with contextID trace enabled by
> default. Context ID is relatively low overhead - and only emitted at
> start of trace  / context changes.
> But the explanation of the parameters currently reads as though they
> always have an effect - and not putting them in there will omit the
> effect - unless you spot the very subtle line at the end.
> 
> The user does not need to know about parameters that have no effect!

Thanks for the suggestion, Mike.

> Perhaps a better approach would be to explain the above - an explicit
> statement that "perf will always enable PID/ contextID tracing at the
> relevant EL - but for EL2 it is possible to make specific adjustments
> using parameters..."

Usually users assume the PMU format has no effect if without set it; but
this is not the case for the config "contextid", this config has been
automatically enabled by perf tool.

Based on your suggesiton, will refine the descrption for two things:
clarify what's the common usage for EL1/EL2, and what's specific for
EL2.

Thanks,
Leo


Re: [PATCH v2 5/7] perf cs-etm: Add helper cs_etm__get_pid_fmt()

2021-02-04 Thread Leo Yan
On Thu, Feb 04, 2021 at 10:54:24AM +, Suzuki Kuruppassery Poulose wrote:

[...]

> > > > +int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
> > > > +{
> > > > +   struct int_node *inode;
> > > > +   u64 *metadata, val;
> > > > +
> > > > +   inode = intlist__find(traceid_list, trace_chan_id);
> > > > +   if (!inode)
> > > > +   return -EINVAL;
> > > > +
> > > > +   metadata = inode->priv;
> > > > +
> > > > +   if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
> > > > +   val = metadata[CS_ETM_ETMCR];
> > > > +   /* CONTEXTIDR is traced */
> > > > +   if (val & BIT(ETM_OPT_CTXTID))
> > > > +   *pid_fmt = BIT(ETM_OPT_CTXTID);
> > > > +   } else {
> > > > +   val = metadata[CS_ETMV4_TRCCONFIGR];
> > > > +
> > > > +   *pid_fmt = 0;
> > > > +
> > > > +   /* CONTEXTIDR_EL2 is traced */
> > > > +   if (val & (BIT(ETM4_CFG_BIT_VMID) | 
> > > > BIT(ETM4_CFG_BIT_VMID_OPT)))
> > > > +   *pid_fmt = BIT(ETM_OPT_CTXTID2);
> > > > +
> > > > +   /* CONTEXTIDR_EL1 is traced */
> > > > +   if (val & BIT(ETM4_CFG_BIT_CTXTID))
> > > 
> > > I haven't looked at how this gets used. But, Shouldn't this be :
> > > 
> > >   else if (val & BIT(ETM4_CFG_BIT_CTXTID)) ?
> > 
> > Actually it's deliberately to set both bits ETM_OPT_CTXTID2 and
> > ETM_OPT_CTXTID if user has enable configs "contextid1" and
> > "contextid2".  So this is exactly the reversed flow in the
> > function cs_etmv4_get_config().
> 
> The point is, we don't care if the user selected both options. What we
> care is, where can we find the PID. CONTEXTIDR_EL1 or CONTEXTIDR_EL2.
> As such, get_pid_fmt simply should make that decision and pass it on.
> So, if the CONTEXTIDR_EL2 is selected (which can only be done successfully
> on an EL2 kernel), thats our pid.
> 
> So we should return the format for the PID here. i.e
>  ETM_OPT_CTXTID2 OR ETM_OPT_CTXTID. But not both.

Okay, if so I will follow your suggestion.

Thanks,
Leo


Re: [PATCH 8/8] perf arm-spe: Set thread TID

2021-02-04 Thread Leo Yan
On Mon, Feb 01, 2021 at 07:40:45PM +0200, James Clark wrote:
> 
> On 31/01/2021 14:01, Leo Yan wrote:
> > Option 1: by merging patches 07/08 and 08/08, we can firstly support PID
> > tracing for root namespace, and later we can extend to support PID
> > tracing in container (and in VMs).
> > 
> > Option 2: we can use the software method to establish PID for SPE
> > trace, which can base on kernel's events PERF_RECORD_SWITCH /
> > PERF_RECORD_SWITCH_CPU_WIDE and check context switch ip.
> > 
> > To be honest, I am a bit concern for option 1 for later might
> > introduce regression when later support PID for containers (and VMs).
> > If you have a plan for option 1, I think it's good to record current
> > limitation and the plan for next step in the commit log, so we can merge
> > this patch at this time and later extend for containers.
> > 
> > Otherwise, we need to consider how to implement the PID tracing with
> > option 2.  If it is the case, we should firstly only merge patches
> > 01 ~ 06 for data source enabling.  How about you think for this?
> 
> In my opinion we should do option 1 and use what is there at the moment. That
> gets users 90% of the functionality right now.
> 
> I plan to look at option 2 at some point, and it can always be added on top of
> option 1 or replace what is there. But I don't know when I would get to it or
> how long it will take.

Firstly, sorry for long replying.

Have offline discussion with James and I took time to look into Intel PT
implementation for PID tracing.

AFAICT, for tracing root namespace, the option 1 with using CONTEXTIDR is
the most reliable way for Arm SPE; at beginning I thought option 2 is
better choice for Arm SPE, but after went through Intel PT's code, I think
Arm SPE cannot achieve the same result with Intel PT, this is caused by
its "statistical" character.

Let me explain why Arm SPE has problem with the option 2.  If we want to
enable option 2 by using perf context switch events and switch_ip
approach, it uses below logic:

  Step1: when event PERF_RECORD_SWITCH or PERF_RECORD_SWITCH_CPU_WIDE
  is coming, invokes below functions.  So it tells the "machine"
  context that the process is switched to new one; at this step, it
  simply caches the new PID/TID into the "machine" context.  But the
  samples doesn't really set the new value.

intel_pt_context_switch()
  `> machine__set_current_tid()

  Step2: when detect the branch instruction's target address equals
  to the address of symbol "__switch_to", this means the the CPU
  really switches context to the next process in the low level code,
  afterwards it will retrieve the cached TID/PID from the "machine"
  context and set the correct PID for "ptq->pid" (see
  intel_pt_sample_set_pid_tid_cpu()), then "ptq->tid" is
  used for synthesizing samples.

Arm SPE has the problem for step2, due to the trace uses statistical
approach, it doesn't trace the complete branch instructions, so it
cannot promise to capture all branches for the symbol "__switch_to".
If we only use the events PERF_RECORD_SWITCH /
PERF_RECORD_SWITCH_CPU_WIDE, then it will lead to the coarse result
for PID tracing.

For this reason, seems to me it's pragmatic to use CONTEXTIDR for
PID tracing at current stage, at least it can allow the root domain
tracing works accurately.  But this will leave the issue for tracing
PID in non root namespace, we need to figure out solution later.

Hi Mark.R, Al, do you have any comments for this?

Thanks,
Leo

> >> Signed-off-by: Leo Yan 
> >> Signed-off-by: James Clark 
> > 
> > Besides for techinical question, you could add your "Co-developed-by"
> > tags for patches 06, 07, 08/08, which you have took time to refin them.
> > 
> > Thanks you for kindly efforts.
> > 
> > [1] https://lore.kernel.org/patchwork/patch/1353286/
> > 
> >> Cc: Peter Zijlstra 
> >> Cc: Ingo Molnar 
> >> Cc: Arnaldo Carvalho de Melo 
> >> Cc: Mark Rutland 
> >> Cc: Alexander Shishkin 
> >> Cc: Jiri Olsa 
> >> Cc: Namhyung Kim 
> >> Cc: John Garry 
> >> Cc: Will Deacon 
> >> Cc: Mathieu Poirier 
> >> Cc: Al Grant 
> >> Cc: Andre Przywara 
> >> Cc: Wei Li 
> >> Cc: Tan Xiaojun 
> >> Cc: Adrian Hunter 
> >> ---
> >>  tools/perf/util/arm-spe.c | 75 ++-
> >>  1 file changed, 50 insertions(+), 25 deletions(-)
> >>
> >> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> >> index 27a0b9dfe22d..9828fad7e516 100644
> >> --- a/tools/perf/util/arm-spe.c
> >> +++ b/tools/p

Re: [PATCH v2 7/7] Documentation: coresight: Add PID tracing description

2021-02-03 Thread Leo Yan
Hi Mike,

On Wed, Feb 03, 2021 at 05:39:54PM +, Mike Leach wrote:

[...]

> > +2.2) Tracing PID
> > +
> > +When the kernel is running at EL2 with Virtualization Host Extensions 
> > (VHE),
> > +perf records CONTEXTIDR_EL2 in the trace data and can be used as PID when
> > +decoding; and if the kernel is running at EL1 with nVHE, CONTEXTIDR_EL1 is
> > +traced for PID.
> > +
> 
> Would this introductory paragraph be better if is explained where the
> kernel stores the PID for the different levels, then we logically move
> on to how to trace this in perf.
> 
> e.g:-
> 
> "The lernel can be built to write the PID value into the PE ContextID 
> registers.
> For a kernel running at EL1, the PID is stored in CONTEXTIDR_EL1.
> A PE may implement ARM Virtualisation Host Extensions (VHE), were the
> kernel can run at EL2 as a virtualisation host.
> In this case the PID value is stored in CONTEXTIDR_EL2.
> perf provides PMU options which program the ETM to insert these values
> into the trace data."

Will in next spin; thanks a lot for writing up!

> > +To support tracing PID for the kernel runs at different exception levels,
> > +the PMU formats are defined as follow:
> > +
> > +  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
> > +kernel is running at EL1, "contextid1" enables the PID
> > +tracing; when the kernel is running at EL2, this enables
> > +tracing the PID of guest applications.
> > +
> > +  "contextid2": Only usable when the kernel is running at EL2.  When
> > +selected, enables PID tracing on EL2 kernel.
> > +
> > +  "contextid":  Will be an alias for the option that enables PID
> > +tracing.  I.e,
> > +contextid == contextid1, on EL1 kernel.
> > +contextid == contextid2, on EL2 kernel.
> > +
> > +The perf tool automatically sets corresponding bit for the "contextid" 
> > config,
> > +therefore, the user doesn't have to bother which EL the kernel is running.
> > +
> > +  i.e, perf record -e cs_etm/contextid/u -- uname
> > +or perf record -e cs_etm//u -- uname
> > +
> > +will always do the "PID" tracing, independent of the kernel EL.
> > +
> 
> This is telling me that both cs_etm// and cs_etm/contextid/ have the
> same effect - trace PID. Is this correct?

Correct.

> If so, then contextid, contextid1 and contextid2 have no effect except
> in specific EL2 circumstances.

Yes, exactly.

Thanks,
Leo

> > +When the kernel is running at EL2 with VHE, if user wants to trace both the
> > +PIDs for both host and guest, the two configs "contextid1" and "contextid2"
> > +can be set at the same time:
> > +
> > +  perf record -e cs_etm/contextid1,contextid2/u -- uname
> > +
> >
> 
> 
> Regards
> 
> Mike
> 
> 
> >  Generating coverage files for Feedback Directed Optimization: AutoFDO
> >  -
> > --
> > 2.25.1
> >
> 
> 
> --
> Mike Leach
> Principal Engineer, ARM Ltd.
> Manchester Design Centre. UK


Re: [PATCH v2 7/7] Documentation: coresight: Add PID tracing description

2021-02-03 Thread Leo Yan
On Tue, Feb 02, 2021 at 11:24:48PM +, Suzuki Kuruppassery Poulose wrote:
> On 2/2/21 4:38 PM, Leo Yan wrote:
> > After support the PID tracing for the kernel in EL1 or EL2, the usage
> > gets more complicated.
> > 
> > This patch gives description for the PMU formats of contextID configs,
> > this can help users to understand how to control the knobs for PID
> > tracing when the kernel is in different ELs.
> > 
> > Signed-off-by: Leo Yan 
> > ---
> >   Documentation/trace/coresight/coresight.rst | 37 +
> >   1 file changed, 37 insertions(+)
> > 
> > diff --git a/Documentation/trace/coresight/coresight.rst 
> > b/Documentation/trace/coresight/coresight.rst
> > index 0b73acb44efa..771558f22938 100644
> > --- a/Documentation/trace/coresight/coresight.rst
> > +++ b/Documentation/trace/coresight/coresight.rst
> > @@ -512,6 +512,43 @@ The --itrace option controls the type and frequency of 
> > synthesized events
> >   Note that only 64-bit programs are currently supported - further work is
> >   required to support instruction decode of 32-bit Arm programs.
> > +2.2) Tracing PID
> > +
> > +When the kernel is running at EL2 with Virtualization Host Extensions 
> > (VHE),
> > +perf records CONTEXTIDR_EL2 in the trace data and can be used as PID when
> > +decoding; and if the kernel is running at EL1 with nVHE, CONTEXTIDR_EL1 is
> > +traced for PID.
> > +
> > +To support tracing PID for the kernel runs at different exception levels,
> > +the PMU formats are defined as follow:
> > +
> > +  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
> > +kernel is running at EL1, "contextid1" enables the PID
> > +tracing; when the kernel is running at EL2, this enables
> > +tracing the PID of guest applications.
> > +
> > +  "contextid2": Only usable when the kernel is running at EL2.  When
> > +selected, enables PID tracing on EL2 kernel.
> > +
> > +  "contextid":  Will be an alias for the option that enables PID
> > +tracing.  I.e,
> > +contextid == contextid1, on EL1 kernel.
> > +contextid == contextid2, on EL2 kernel.
> > +
> > +The perf tool automatically sets corresponding bit for the "contextid" 
> > config,
> > +therefore, the user doesn't have to bother which EL the kernel is running.
> > +
> > +  i.e, perf record -e cs_etm/contextid/u -- uname
> > +or perf record -e cs_etm//u -- uname
> > +
> > +will always do the "PID" tracing, independent of the kernel EL.
> > +
> > +When the kernel is running at EL2 with VHE, if user wants to trace both the
> > +PIDs for both host and guest, the two configs "contextid1" and "contextid2"
> > +can be set at the same time:
> > +
> > +  perf record -e cs_etm/contextid1,contextid2/u -- uname
> > +
> 
> To make this case clear, we could change the command from uname to
> something like:
> 
> perf record -e cs_etm/contextid1,contextid2/u -- vm

Exactly, will refine.

> Otherwise looks good to me.
> 
> With the above fixed,
> 
> Reviewed-by: Suzuki K Poulose 

Thanks,
Leo


Re: [PATCH v2 6/7] perf cs-etm: Detect pid in VMID for kernel running at EL2

2021-02-03 Thread Leo Yan
On Tue, Feb 02, 2021 at 11:29:47PM +, Suzuki Kuruppassery Poulose wrote:
> On 2/2/21 4:38 PM, Leo Yan wrote:
> > From: Suzuki K Poulose 
> > 
> > The PID of the task could be traced as VMID when the kernel is running
> > at EL2.  Teach the decoder to look for VMID when the CONTEXTIDR (Arm32)
> > or CONTEXTIDR_EL1 (Arm64) is invalid but we have a valid VMID.
> > 
> > Cc: Mike Leach 
> > Cc: Mathieu Poirier 
> > Cc: Al Grant 
> > Co-developed-by: Leo Yan 
> > Signed-off-by: Suzuki K Poulose 
> > Signed-off-by: Leo Yan 
> > ---
> >   .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 32 ---
> >   1 file changed, 28 insertions(+), 4 deletions(-)
> > 
> > diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
> > b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> > index 3f4bc4050477..fb2a163ff74e 100644
> > --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> > +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> > @@ -6,6 +6,7 @@
> >* Author: Mathieu Poirier 
> >*/
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> > @@ -491,13 +492,36 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
> > const ocsd_generic_trace_elem *elem,
> > const uint8_t trace_chan_id)
> >   {
> > -   pid_t tid;
> > +   pid_t tid = -1;
> > +   u64 pid_fmt;
> > +   int ret;
> > -   /* Ignore PE_CONTEXT packets that don't have a valid contextID */
> > -   if (!elem->context.ctxt_id_valid)
> > +   ret = cs_etm__get_pid_fmt(trace_chan_id, _fmt);
> > +   if (ret)
> 
> Is this something we can cache in this function ? e.g,
>   static u64 pid_fmt;
> 
>   if (!pid_pfmt)
>   ret = cs_etm__get_pid_fmt(trace_chan_id, _fmt);
> 
> As all the ETMs will be running at the same exception level.

Sorry that I let you repeated your comments again.

To be honest, I considered this after read your comment in the previous
series, but I thought it's possible that multiple CPUs have different
PID format, especially for big.LITTLE arch.  After read your suggestion
again, I think my concern is not valid, even for big.LITTLE, all CPUs
should run on the same kernel exception level.

So will follow up your suggestion to cache "pid_fmt".

> 
> > +   return OCSD_RESP_FATAL_SYS_ERR;
> > +
> > +   /*
> > +* Process the PE_CONTEXT packets if we have a valid contextID or VMID.
> > +* If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2
> > +* as VMID, Bit ETM_OPT_CTXTID2 is set in this case.
> > +*/
> > +   switch (pid_fmt) {
> > +   case BIT(ETM_OPT_CTXTID):
> > +   if (elem->context.ctxt_id_valid)
> > +   tid = elem->context.context_id;
> > +   break;
> > +   case BIT(ETM_OPT_CTXTID2) | BIT(ETM_OPT_CTXTID):
> 
> I would rather fix the cs_etm__get_pid_fmt() to return either of these
> as commented. i.e, ETM_OPT_CTXTID or ETM_OPT_CTXTID2. Thus we don't
> need the this case.

I explained why I set both bits for ETM_OPT_CTXTID and ETM_OPT_CTXTID2
in the patch 05/07.  Could you take a look for it?

> With the above two addressed:
> 
> Reviewed-by: Suzuki K Poulose 

Thanks,
Leo


Re: [PATCH v2 5/7] perf cs-etm: Add helper cs_etm__get_pid_fmt()

2021-02-03 Thread Leo Yan
On Tue, Feb 02, 2021 at 11:19:22PM +, Suzuki Kuruppassery Poulose wrote:
> On 2/2/21 4:38 PM, Leo Yan wrote:
> > This patch adds helper function cs_etm__get_pid_fmt(), by passing
> > parameter "traceID", it returns the PID format.
> > 
> > Signed-off-by: Leo Yan 
> > ---
> >   tools/perf/util/cs-etm.c | 43 
> >   tools/perf/util/cs-etm.h |  1 +
> >   2 files changed, 44 insertions(+)
> > 
> > diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> > index a2a369e2fbb6..8194ddbd01e5 100644
> > --- a/tools/perf/util/cs-etm.c
> > +++ b/tools/perf/util/cs-etm.c
> > @@ -7,6 +7,7 @@
> >*/
> >   #include 
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> > @@ -156,6 +157,48 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu)
> > return 0;
> >   }
> > +/*
> > + * The returned PID format is presented by two bits:
> > + *
> > + *   Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced;
> > + *   Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced.
> > + *
> > + * It's possible that these two bits are set together, this means the 
> > tracing
> > + * contains PIDs for both CONTEXTIDR_EL1 and CONTEXTIDR_EL2.
> 
> This is a bit confusing. If both the bits are set, the session
> was run on an EL2 kernel. Thus, the PID is always in CONTEXTIDR_EL2.

Sorry for confusion.  I'd like to rephrase as:

It's possible that the two bits ETM_OPT_CTXTID and ETM_OPT_CTXTID2 are
enabled at the same time when the session runs on an EL2 kernel.  This
means the CONTEXTIDR_EL1 and CONTEXTIDR_EL2 both will be recorded in
the trace data, the tool will selectively use CONTEXTIDR_EL2 as PID.

> > + */
> > +int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
> > +{
> > +   struct int_node *inode;
> > +   u64 *metadata, val;
> > +
> > +   inode = intlist__find(traceid_list, trace_chan_id);
> > +   if (!inode)
> > +   return -EINVAL;
> > +
> > +   metadata = inode->priv;
> > +
> > +   if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
> > +   val = metadata[CS_ETM_ETMCR];
> > +   /* CONTEXTIDR is traced */
> > +   if (val & BIT(ETM_OPT_CTXTID))
> > +   *pid_fmt = BIT(ETM_OPT_CTXTID);
> > +   } else {
> > +   val = metadata[CS_ETMV4_TRCCONFIGR];
> > +
> > +   *pid_fmt = 0;
> > +
> > +   /* CONTEXTIDR_EL2 is traced */
> > +   if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT)))
> > +   *pid_fmt = BIT(ETM_OPT_CTXTID2);
> > +
> > +   /* CONTEXTIDR_EL1 is traced */
> > +   if (val & BIT(ETM4_CFG_BIT_CTXTID))
> 
> I haven't looked at how this gets used. But, Shouldn't this be :
> 
>   else if (val & BIT(ETM4_CFG_BIT_CTXTID)) ?

Actually it's deliberately to set both bits ETM_OPT_CTXTID2 and
ETM_OPT_CTXTID if user has enable configs "contextid1" and
"contextid2".  So this is exactly the reversed flow in the
function cs_etmv4_get_config().

Thanks,
Leo


Re: [PATCH v2 1/7] coresight: etm-perf: Clarify comment on perf options

2021-02-03 Thread Leo Yan
Hi Suzuki,

On Tue, Feb 02, 2021 at 11:00:42PM +, Suzuki Kuruppassery Poulose wrote:
> Hi Leo
> 
> On 2/2/21 4:38 PM, Leo Yan wrote:
> > In theory, the options should be arbitrary values and are neutral for
> > any ETM version; so far perf tool uses ETMv3.5/PTM ETMCR config bits
> > except for register's bit definitions, also uses as options.
> > 
> > This can introduce confusion, especially if we want to add a new option
> > but the new option is not supported by ETMv3.5/PTM ETMCR.  But on the
> > other hand, we cannot change options since these options are generic
> > CoreSight PMU ABI.
> > 
> > For easier maintenance and avoid confusion, this patch refines the
> > comment to clarify perf options, and gives out the background info for
> > these bits are coming from ETMv3.5/PTM.  Afterwards, we should take
> > these options as general knobs, and if there have any confliction with
> > ETMv3.5/PTM, should consider to define saperate macros for ETMv3.5/PTM
> > ETMCR config bits.
> > 
> > Suggested-by: Suzuki K Poulose 
> > Signed-off-by: Leo Yan 
> 
> The patch looks good to me.  The only concern I have is, whether
> we should split this patch to kernel vs tools ? As the kernel
> changes go via coresight tree and the tools patch may go via
> perf tree ?

Yes, will split the patch.

> Either way, for the patch:
> 
> Reviewed-by: Suzuki K Poulose 

Thanks for review and suggestion.

> > ---
> >   .../hwtracing/coresight/coresight-etm-perf.c|  5 -
> >   include/linux/coresight-pmu.h   | 17 -
> >   tools/include/linux/coresight-pmu.h | 17 -
> >   3 files changed, 28 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
> > b/drivers/hwtracing/coresight/coresight-etm-perf.c
> > index bdc34ca449f7..465ef1aa8c82 100644
> > --- a/drivers/hwtracing/coresight/coresight-etm-perf.c
> > +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
> > @@ -27,7 +27,10 @@ static bool etm_perf_up;
> >   static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
> >   static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
> > -/* ETMv3.5/PTM's ETMCR is 'config' */
> > +/*
> > + * The PMU formats were orignally for ETMv3.5/PTM's ETMCR 'config';
> > + * now take them as general formats and apply on all ETMs.
> > + */
> >   PMU_FORMAT_ATTR(cycacc,   "config:" __stringify(ETM_OPT_CYCACC));
> >   PMU_FORMAT_ATTR(contextid,"config:" __stringify(ETM_OPT_CTXTID));
> >   PMU_FORMAT_ATTR(timestamp,"config:" __stringify(ETM_OPT_TS));
> > diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
> > index b0e35eec6499..5dc47cfdcf07 100644
> > --- a/include/linux/coresight-pmu.h
> > +++ b/include/linux/coresight-pmu.h
> > @@ -10,11 +10,18 @@
> >   #define CORESIGHT_ETM_PMU_NAME "cs_etm"
> >   #define CORESIGHT_ETM_PMU_SEED  0x10
> > -/* ETMv3.5/PTM's ETMCR config bit */
> > -#define ETM_OPT_CYCACC  12
> > -#define ETM_OPT_CTXTID 14
> > -#define ETM_OPT_TS  28
> > -#define ETM_OPT_RETSTK 29
> > +/*
> > + * Below are the definition of bit offsets for perf option, and works as
> > + * arbitrary values for all ETM versions.
> > + *
> > + * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
> > + * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
> > + * directly use below macros as config bits.
> > + */
> > +#define ETM_OPT_CYCACC 12
> > +#define ETM_OPT_CTXTID 14
> > +#define ETM_OPT_TS 28
> > +#define ETM_OPT_RETSTK 29
> >   /* ETMv4 CONFIGR programming bits for the ETM OPTs */
> >   #define ETM4_CFG_BIT_CYCACC   4
> > diff --git a/tools/include/linux/coresight-pmu.h 
> > b/tools/include/linux/coresight-pmu.h
> > index b0e35eec6499..5dc47cfdcf07 100644
> > --- a/tools/include/linux/coresight-pmu.h
> > +++ b/tools/include/linux/coresight-pmu.h
> > @@ -10,11 +10,18 @@
> >   #define CORESIGHT_ETM_PMU_NAME "cs_etm"
> >   #define CORESIGHT_ETM_PMU_SEED  0x10
> > -/* ETMv3.5/PTM's ETMCR config bit */
> > -#define ETM_OPT_CYCACC  12
> > -#define ETM_OPT_CTXTID 14
> > -#define ETM_OPT_TS  28
> > -#define ETM_OPT_RETSTK 29
> > +/*
> > + * Below are the definition of bit offsets for perf option, and works as
> > + * arbitrary values for all ETM versions.
> > + *
> > + * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
> > + * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
> > + * directly use below macros as config bits.
> > + */
> > +#define ETM_OPT_CYCACC 12
> > +#define ETM_OPT_CTXTID 14
> > +#define ETM_OPT_TS 28
> > +#define ETM_OPT_RETSTK 29
> >   /* ETMv4 CONFIGR programming bits for the ETM OPTs */
> >   #define ETM4_CFG_BIT_CYCACC   4
> > 
> 


[PATCH v2 6/7] perf cs-etm: Detect pid in VMID for kernel running at EL2

2021-02-02 Thread Leo Yan
From: Suzuki K Poulose 

The PID of the task could be traced as VMID when the kernel is running
at EL2.  Teach the decoder to look for VMID when the CONTEXTIDR (Arm32)
or CONTEXTIDR_EL1 (Arm64) is invalid but we have a valid VMID.

Cc: Mike Leach 
Cc: Mathieu Poirier 
Cc: Al Grant 
Co-developed-by: Leo Yan 
Signed-off-by: Suzuki K Poulose 
Signed-off-by: Leo Yan 
---
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 32 ---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 3f4bc4050477..fb2a163ff74e 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -6,6 +6,7 @@
  * Author: Mathieu Poirier 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -491,13 +492,36 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
const ocsd_generic_trace_elem *elem,
const uint8_t trace_chan_id)
 {
-   pid_t tid;
+   pid_t tid = -1;
+   u64 pid_fmt;
+   int ret;
 
-   /* Ignore PE_CONTEXT packets that don't have a valid contextID */
-   if (!elem->context.ctxt_id_valid)
+   ret = cs_etm__get_pid_fmt(trace_chan_id, _fmt);
+   if (ret)
+   return OCSD_RESP_FATAL_SYS_ERR;
+
+   /*
+* Process the PE_CONTEXT packets if we have a valid contextID or VMID.
+* If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2
+* as VMID, Bit ETM_OPT_CTXTID2 is set in this case.
+*/
+   switch (pid_fmt) {
+   case BIT(ETM_OPT_CTXTID):
+   if (elem->context.ctxt_id_valid)
+   tid = elem->context.context_id;
+   break;
+   case BIT(ETM_OPT_CTXTID2) | BIT(ETM_OPT_CTXTID):
+   case BIT(ETM_OPT_CTXTID2):
+   if (elem->context.vmid_valid)
+   tid = elem->context.vmid;
+   break;
+   default:
+   break;
+   }
+
+   if (tid == -1)
return OCSD_RESP_CONT;
 
-   tid =  elem->context.context_id;
if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
return OCSD_RESP_FATAL_SYS_ERR;
 
-- 
2.25.1



[PATCH v2 7/7] Documentation: coresight: Add PID tracing description

2021-02-02 Thread Leo Yan
After support the PID tracing for the kernel in EL1 or EL2, the usage
gets more complicated.

This patch gives description for the PMU formats of contextID configs,
this can help users to understand how to control the knobs for PID
tracing when the kernel is in different ELs.

Signed-off-by: Leo Yan 
---
 Documentation/trace/coresight/coresight.rst | 37 +
 1 file changed, 37 insertions(+)

diff --git a/Documentation/trace/coresight/coresight.rst 
b/Documentation/trace/coresight/coresight.rst
index 0b73acb44efa..771558f22938 100644
--- a/Documentation/trace/coresight/coresight.rst
+++ b/Documentation/trace/coresight/coresight.rst
@@ -512,6 +512,43 @@ The --itrace option controls the type and frequency of 
synthesized events
 Note that only 64-bit programs are currently supported - further work is
 required to support instruction decode of 32-bit Arm programs.
 
+2.2) Tracing PID
+
+When the kernel is running at EL2 with Virtualization Host Extensions (VHE),
+perf records CONTEXTIDR_EL2 in the trace data and can be used as PID when
+decoding; and if the kernel is running at EL1 with nVHE, CONTEXTIDR_EL1 is
+traced for PID.
+
+To support tracing PID for the kernel runs at different exception levels,
+the PMU formats are defined as follow:
+
+  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
+kernel is running at EL1, "contextid1" enables the PID
+tracing; when the kernel is running at EL2, this enables
+tracing the PID of guest applications.
+
+  "contextid2": Only usable when the kernel is running at EL2.  When
+selected, enables PID tracing on EL2 kernel.
+
+  "contextid":  Will be an alias for the option that enables PID
+tracing.  I.e,
+contextid == contextid1, on EL1 kernel.
+contextid == contextid2, on EL2 kernel.
+
+The perf tool automatically sets corresponding bit for the "contextid" config,
+therefore, the user doesn't have to bother which EL the kernel is running.
+
+  i.e, perf record -e cs_etm/contextid/u -- uname
+or perf record -e cs_etm//u -- uname
+
+will always do the "PID" tracing, independent of the kernel EL.
+
+When the kernel is running at EL2 with VHE, if user wants to trace both the
+PIDs for both host and guest, the two configs "contextid1" and "contextid2"
+can be set at the same time:
+
+  perf record -e cs_etm/contextid1,contextid2/u -- uname
+
 
 Generating coverage files for Feedback Directed Optimization: AutoFDO
 -
-- 
2.25.1



[PATCH v2 5/7] perf cs-etm: Add helper cs_etm__get_pid_fmt()

2021-02-02 Thread Leo Yan
This patch adds helper function cs_etm__get_pid_fmt(), by passing
parameter "traceID", it returns the PID format.

Signed-off-by: Leo Yan 
---
 tools/perf/util/cs-etm.c | 43 
 tools/perf/util/cs-etm.h |  1 +
 2 files changed, 44 insertions(+)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a2a369e2fbb6..8194ddbd01e5 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -7,6 +7,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -156,6 +157,48 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu)
return 0;
 }
 
+/*
+ * The returned PID format is presented by two bits:
+ *
+ *   Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced;
+ *   Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced.
+ *
+ * It's possible that these two bits are set together, this means the tracing
+ * contains PIDs for both CONTEXTIDR_EL1 and CONTEXTIDR_EL2.
+ */
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
+{
+   struct int_node *inode;
+   u64 *metadata, val;
+
+   inode = intlist__find(traceid_list, trace_chan_id);
+   if (!inode)
+   return -EINVAL;
+
+   metadata = inode->priv;
+
+   if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
+   val = metadata[CS_ETM_ETMCR];
+   /* CONTEXTIDR is traced */
+   if (val & BIT(ETM_OPT_CTXTID))
+   *pid_fmt = BIT(ETM_OPT_CTXTID);
+   } else {
+   val = metadata[CS_ETMV4_TRCCONFIGR];
+
+   *pid_fmt = 0;
+
+   /* CONTEXTIDR_EL2 is traced */
+   if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT)))
+   *pid_fmt = BIT(ETM_OPT_CTXTID2);
+
+   /* CONTEXTIDR_EL1 is traced */
+   if (val & BIT(ETM4_CFG_BIT_CTXTID))
+   *pid_fmt |= BIT(ETM_OPT_CTXTID);
+   }
+
+   return 0;
+}
+
 void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
  u8 trace_chan_id)
 {
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 4ad925d6d799..7cc3bba0017d 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -173,6 +173,7 @@ struct cs_etm_packet_queue {
 int cs_etm__process_auxtrace_info(union perf_event *event,
  struct perf_session *session);
 int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
 int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
 pid_t tid, u8 trace_chan_id);
 bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
-- 
2.25.1



[PATCH v2 4/7] perf cs-etm: Support PID tracing in config

2021-02-02 Thread Leo Yan
From: Suzuki K Poulose 

If the kernel is running at EL2, the pid of a task is exposed via VMID
instead of the CONTEXTID.  Add support for this in the perf tool.

This patch respects user setting if user has specified any configs
from "contextid", "contextid1" or "contextid2"; otherwise, it
dynamically sets config based on PMU format "contextid".

Cc: Mike Leach 
Cc: Mathieu Poirier 
Cc: Al Grant 
Co-developed-by: Leo Yan 
Signed-off-by: Suzuki K Poulose 
Signed-off-by: Leo Yan 
---
 tools/include/linux/coresight-pmu.h |  3 ++
 tools/perf/arch/arm/util/cs-etm.c   | 61 +++--
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/tools/include/linux/coresight-pmu.h 
b/tools/include/linux/coresight-pmu.h
index 5dc47cfdcf07..4ac5c081af93 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -20,14 +20,17 @@
  */
 #define ETM_OPT_CYCACC 12
 #define ETM_OPT_CTXTID 14
+#define ETM_OPT_CTXTID215
 #define ETM_OPT_TS 28
 #define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
 #define ETM4_CFG_BIT_CTXTID6
+#define ETM4_CFG_BIT_VMID  7
 #define ETM4_CFG_BIT_TS11
 #define ETM4_CFG_BIT_RETSTK12
+#define ETM4_CFG_BIT_VMID_OPT  15
 
 static inline int coresight_get_trace_id(int cpu)
 {
diff --git a/tools/perf/arch/arm/util/cs-etm.c 
b/tools/perf/arch/arm/util/cs-etm.c
index c25c878fd06c..fa6f91a7c8a1 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -67,6 +67,7 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
char path[PATH_MAX];
int err = -EINVAL;
u32 val;
+   u64 contextid;
 
ptr = container_of(itr, struct cs_etm_recording, itr);
cs_etm_pmu = ptr->cs_etm_pmu;
@@ -86,25 +87,59 @@ static int cs_etm_set_context_id(struct auxtrace_record 
*itr,
goto out;
}
 
+   /* User has configured for PID tracing, respects it. */
+   contextid = evsel->core.attr.config &
+   (BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_CTXTID2));
+
/*
-* TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID tracing
-* is supported:
-*  0b0 Context ID tracing is not supported.
-*  0b00100 Maximum of 32-bit Context ID size.
-*  All other values are reserved.
+* If user doesn't configure the contextid format, parse PMU format and
+* enable PID tracing according to the "contextid" format bits:
+*
+*   If bit ETM_OPT_CTXTID is set, trace CONTEXTIDR_EL1;
+*   If bit ETM_OPT_CTXTID2 is set, trace CONTEXTIDR_EL2.
 */
-   val = BMVAL(val, 5, 9);
-   if (!val || val != 0x4) {
-   err = -EINVAL;
-   goto out;
+   if (!contextid)
+   contextid = perf_pmu__format_bits(_etm_pmu->format,
+ "contextid");
+
+   if (contextid & BIT(ETM_OPT_CTXTID)) {
+   /*
+* TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
+* tracing is supported:
+*  0b0 Context ID tracing is not supported.
+*  0b00100 Maximum of 32-bit Context ID size.
+*  All other values are reserved.
+*/
+   val = BMVAL(val, 5, 9);
+   if (!val || val != 0x4) {
+   pr_err("%s: CONTEXTIDR_EL1 isn't supported\n",
+  CORESIGHT_ETM_PMU_NAME);
+   err = -EINVAL;
+   goto out;
+   }
+   }
+
+   if (contextid & BIT(ETM_OPT_CTXTID2)) {
+   /*
+* TRCIDR2.VMIDOPT[30:29] != 0 and
+* TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
+* We can't support CONTEXTIDR in VMID if the size of the
+* virtual context id is < 32bit.
+* Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us.
+*/
+   if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+   pr_err("%s: CONTEXTIDR_EL2 isn't supported\n",
+  CORESIGHT_ETM_PMU_NAME);
+   err = -EINVAL;
+   goto out;
+   }
}
 
/* All good, let the kernel know */
-   evsel->core.attr.config |= (1 << ETM_OPT_CTXTID);
+   evsel->core.attr.config |= contextid;
err = 0;
 
 out:
-
return err;
 }
 
@@ -489,7 +524,9 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
config |= BIT(ETM4_CFG_BIT_TS);
if (config_opts & BIT(ETM_OPT_RETSTK))
conf

[PATCH v2 3/7] perf cs-etm: Fix bitmap for option

2021-02-02 Thread Leo Yan
From: Suzuki K Poulose 

When set option with macros ETM_OPT_CTXTID and ETM_OPT_TS, it wrongly
takes these two values (14 and 28 prespectively) as bit masks, but
actually both are the offset for bits.  But this doesn't lead to
further failure due to the AND logic operation will be always true for
ETM_OPT_CTXTID / ETM_OPT_TS.

This patch defines new independent macros (rather than using the
"config" bits) for requesting the "contextid" and "timestamp" for
cs_etm_set_option().

[leoy: Extract the change as a separate patch for easier review]
Signed-off-by: Suzuki K Poulose 
Signed-off-by: Leo Yan 
---
 tools/perf/arch/arm/util/cs-etm.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/arm/util/cs-etm.c 
b/tools/perf/arch/arm/util/cs-etm.c
index bd446aba64f7..c25c878fd06c 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -156,6 +156,10 @@ static int cs_etm_set_timestamp(struct auxtrace_record 
*itr,
return err;
 }
 
+#define ETM_SET_OPT_CTXTID (1 << 0)
+#define ETM_SET_OPT_TS (1 << 1)
+#define ETM_SET_OPT_MASK   (ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS)
+
 static int cs_etm_set_option(struct auxtrace_record *itr,
 struct evsel *evsel, u32 option)
 {
@@ -169,17 +173,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
!cpu_map__has(online_cpus, i))
continue;
 
-   if (option & ETM_OPT_CTXTID) {
+   if (option & ETM_SET_OPT_CTXTID) {
err = cs_etm_set_context_id(itr, evsel, i);
if (err)
goto out;
}
-   if (option & ETM_OPT_TS) {
+   if (option & ETM_SET_OPT_TS) {
err = cs_etm_set_timestamp(itr, evsel, i);
if (err)
goto out;
}
-   if (option & ~(ETM_OPT_CTXTID | ETM_OPT_TS))
+   if (option & ~(ETM_SET_OPT_MASK))
/* Nothing else is currently supported */
goto out;
}
@@ -406,7 +410,7 @@ static int cs_etm_recording_options(struct auxtrace_record 
*itr,
evsel__set_sample_bit(cs_etm_evsel, CPU);
 
err = cs_etm_set_option(itr, cs_etm_evsel,
-   ETM_OPT_CTXTID | ETM_OPT_TS);
+   ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS);
if (err)
goto out;
}
-- 
2.25.1



[PATCH v2 1/7] coresight: etm-perf: Clarify comment on perf options

2021-02-02 Thread Leo Yan
In theory, the options should be arbitrary values and are neutral for
any ETM version; so far perf tool uses ETMv3.5/PTM ETMCR config bits
except for register's bit definitions, also uses as options.

This can introduce confusion, especially if we want to add a new option
but the new option is not supported by ETMv3.5/PTM ETMCR.  But on the
other hand, we cannot change options since these options are generic
CoreSight PMU ABI.

For easier maintenance and avoid confusion, this patch refines the
comment to clarify perf options, and gives out the background info for
these bits are coming from ETMv3.5/PTM.  Afterwards, we should take
these options as general knobs, and if there have any confliction with
ETMv3.5/PTM, should consider to define saperate macros for ETMv3.5/PTM
ETMCR config bits.

Suggested-by: Suzuki K Poulose 
Signed-off-by: Leo Yan 
---
 .../hwtracing/coresight/coresight-etm-perf.c|  5 -
 include/linux/coresight-pmu.h   | 17 -
 tools/include/linux/coresight-pmu.h | 17 -
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index bdc34ca449f7..465ef1aa8c82 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -27,7 +27,10 @@ static bool etm_perf_up;
 static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
 static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
-/* ETMv3.5/PTM's ETMCR is 'config' */
+/*
+ * The PMU formats were orignally for ETMv3.5/PTM's ETMCR 'config';
+ * now take them as general formats and apply on all ETMs.
+ */
 PMU_FORMAT_ATTR(cycacc,"config:" __stringify(ETM_OPT_CYCACC));
 PMU_FORMAT_ATTR(contextid, "config:" __stringify(ETM_OPT_CTXTID));
 PMU_FORMAT_ATTR(timestamp, "config:" __stringify(ETM_OPT_TS));
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index b0e35eec6499..5dc47cfdcf07 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -10,11 +10,18 @@
 #define CORESIGHT_ETM_PMU_NAME "cs_etm"
 #define CORESIGHT_ETM_PMU_SEED  0x10
 
-/* ETMv3.5/PTM's ETMCR config bit */
-#define ETM_OPT_CYCACC  12
-#define ETM_OPT_CTXTID 14
-#define ETM_OPT_TS  28
-#define ETM_OPT_RETSTK 29
+/*
+ * Below are the definition of bit offsets for perf option, and works as
+ * arbitrary values for all ETM versions.
+ *
+ * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
+ * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
+ * directly use below macros as config bits.
+ */
+#define ETM_OPT_CYCACC 12
+#define ETM_OPT_CTXTID 14
+#define ETM_OPT_TS 28
+#define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
diff --git a/tools/include/linux/coresight-pmu.h 
b/tools/include/linux/coresight-pmu.h
index b0e35eec6499..5dc47cfdcf07 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -10,11 +10,18 @@
 #define CORESIGHT_ETM_PMU_NAME "cs_etm"
 #define CORESIGHT_ETM_PMU_SEED  0x10
 
-/* ETMv3.5/PTM's ETMCR config bit */
-#define ETM_OPT_CYCACC  12
-#define ETM_OPT_CTXTID 14
-#define ETM_OPT_TS  28
-#define ETM_OPT_RETSTK 29
+/*
+ * Below are the definition of bit offsets for perf option, and works as
+ * arbitrary values for all ETM versions.
+ *
+ * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
+ * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
+ * directly use below macros as config bits.
+ */
+#define ETM_OPT_CYCACC 12
+#define ETM_OPT_CTXTID 14
+#define ETM_OPT_TS 28
+#define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
-- 
2.25.1



[PATCH v2 2/7] coresight: etm-perf: Support PID tracing for kernel at EL2

2021-02-02 Thread Leo Yan
From: Suzuki K Poulose 

When the kernel is running at EL2, the PID is stored in CONTEXTIDR_EL2.
So, tracing CONTEXTIDR_EL1 doesn't give us the pid of the process.
Thus we should trace the VMID with VMIDOPT set to trace CONTEXTIDR_EL2
instead of CONTEXTIDR_EL1.  Given that we have an existing config
option "contextid" and this will be useful for tracing virtual machines
(when we get to support virtualization).

So instead, this patch extends option CTXTID with an extra bit
ETM_OPT_CTXTID2 (bit 15), thus on an EL2 kernel, we will have another
bit available for the perf tool: ETM_OPT_CTXTID is for kernel running in
EL1, ETM_OPT_CTXTID2 is used when kernel runs in EL2 with VHE enabled.

The tool must be backward compatible for users, i.e, "contextid" today
traces PID and that should remain the same; for this purpose, the perf
tool is updated to automatically set corresponding bit for the
"contextid" config, therefore, the user doesn't have to bother which EL
the kernel is running.

  i.e, perf record -e cs_etm/contextid/u --

will always do the "pid" tracing, independent of the kernel EL.

The driver parses the format "contextid", which traces CONTEXTIDR_EL1
for ETM_OPT_CTXTID (on EL1 kernel) and traces CONTEXTIDR_EL2 for
ETM_OPT_CTXTID2 (on EL2 kernel).

Besides the enhancement for format "contexid", extra two formats are
introduced: "contextid1" and "contextid2".  This considers to support
tracing both CONTEXTIDR_EL1 and CONTEXTIDR_EL2 when the kernel is
running at EL2.  Finally, the PMU formats are defined as follow:

  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
kernel is running at EL1, "contextid1" enables the PID
tracing; when the kernel is running at EL2, this enables
tracing the PID of guest applications.

  "contextid2": Only usable when the kernel is running at EL2.  When
selected, enables PID tracing on EL2 kernel.

  "contextid":  Will be an alias for the option that enables PID
tracing.  I.e,
contextid == contextid1, on EL1 kernel.
contextid == contextid2, on EL2 kernel.

Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Mike Leach 
Cc: Leo Yan 
Signed-off-by: Suzuki K Poulose 
Signed-off-by: Leo Yan 
---
 .../hwtracing/coresight/coresight-etm-perf.c  | 27 ++-
 .../coresight/coresight-etm4x-core.c  | 13 +
 include/linux/coresight-pmu.h |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 465ef1aa8c82..0f603b4094f2 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -32,15 +32,40 @@ static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
  * now take them as general formats and apply on all ETMs.
  */
 PMU_FORMAT_ATTR(cycacc,"config:" __stringify(ETM_OPT_CYCACC));
-PMU_FORMAT_ATTR(contextid, "config:" __stringify(ETM_OPT_CTXTID));
+/* contextid1 enables tracing CONTEXTIDR_EL1 for ETMv4 */
+PMU_FORMAT_ATTR(contextid1,"config:" __stringify(ETM_OPT_CTXTID));
+/* contextid2 enables tracing CONTEXTIDR_EL2 for ETMv4 */
+PMU_FORMAT_ATTR(contextid2,"config:" __stringify(ETM_OPT_CTXTID2));
 PMU_FORMAT_ATTR(timestamp, "config:" __stringify(ETM_OPT_TS));
 PMU_FORMAT_ATTR(retstack,  "config:" __stringify(ETM_OPT_RETSTK));
 /* Sink ID - same for all ETMs */
 PMU_FORMAT_ATTR(sinkid,"config2:0-31");
 
+/*
+ * contextid always traces the "PID".  The PID is in CONTEXTIDR_EL1
+ * when the kernel is running at EL1; when the kernel is at EL2,
+ * the PID is in CONTEXTIDR_EL2.
+ */
+static ssize_t format_attr_contextid_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+   int pid_fmt = ETM_OPT_CTXTID;
+
+#if defined(CONFIG_CORESIGHT_SOURCE_ETM4X)
+   pid_fmt = is_kernel_in_hyp_mode() ? ETM_OPT_CTXTID2 : ETM_OPT_CTXTID;
+#endif
+   return sprintf(page, "config:%d\n", pid_fmt);
+}
+
+struct device_attribute format_attr_contextid =
+   __ATTR(contextid, 0444, format_attr_contextid_show, NULL);
+
 static struct attribute *etm_config_formats_attr[] = {
_attr_cycacc.attr,
_attr_contextid.attr,
+   _attr_contextid1.attr,
+   _attr_contextid2.attr,
_attr_timestamp.attr,
_attr_retstack.attr,
_attr_sinkid.attr,
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c 
b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index b20b6ff17cf6..8681c225b0ba 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/driv

[PATCH v2 0/7] coresight: etm-perf: Fix pid tracing with VHE

2021-02-02 Thread Leo Yan
This patch series is to support PID tracing with Virtualization Host
Extensions (VHE).

Since the patch series v1 was sent out for reviewing, we had some
discussion and finalized the solution which is implemented in this
version.  Simply to say, to be backward compatibility, and can both
support PID tracing for the kernel is running at either EL1 or EL2,
the two new PMU formats "contextid1" and "contextid2" are introduced,
which works as a switch to trace PID for EL1 kernel and EL2 kernel
respectively.

The existed PMU format "contextid" needs to be backward compatible for
users, it's changed to an alias for "contextid1" on EL1 kernel and for
"contextid2" on EL2 kernel.  Therefore, even without setting "contextid"
config, the perf tool can dynamically pick up the config for PID
tracing, so the user doesn't have to set the "contexid" config manually.

As results, we now have three PMU formats, for easier understanding the
implementation, just copy and paste the descriptions for these three PMU
formats from the patch 07/07:

  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
kernel is running at EL1, "contextid1" enables the PID
tracing; when the kernel is running at EL2, this enables
tracing the PID of guest applications.

  "contextid2": Only usable when the kernel is running at EL2.  When
selected, enables PID tracing on EL2 kernel.

  "contextid":  Will be an alias for the option that enables PID
tracing.  I.e,
contextid == contextid1, on EL1 kernel.
contextid == contextid2, on EL2 kernel.

This patch series can be cleanly applied on perf/core branch:

  commit cd07e536b020 ("Merge remote-tracking branch 'torvalds/master' into 
perf/core")

... and applied on the mainline kernel:

  commit 88bb507a74ea ("Merge tag 'media/v5.11-3' of 
git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-media")

The patch series has been tested on Arm Juno-r2 board.  Verified the
kernel with EL1 and didn't find issue; though absenting the platform for
kernel with EL2, after some hacking in kernel driver and tool to emulate
the code paths for kernel on EL2, can see the code path is hit without
failure.


Changes from v1:
* Refactored PMU formats, added formats "contextid1"/"contextid2", and
  reworked format "contextid" (Suzuki/Mathieu);
* Refined the comments for perf configs (Leo/Mike);
* Added patch 07/07 for description PID tracing in docs;
* Found the issue for bitmap for option, extracted patch 03/07 for the
  fixing.

Changes from RFC:
* Added comments to clarify cases requested (Leo);
* Explain the change to generic flags for cs_etm_set_option() in the
  commit description;
* Stored PID format in metadata and passed it to decoder (Leo);
* Enhanced cs-etm for backward compatibility (Denis Nikitin).


Leo Yan (3):
  coresight: etm-perf: Clarify comment on perf options
  perf cs-etm: Add helper cs_etm__get_pid_fmt()
  Documentation: coresight: Add PID tracing description

Suzuki K Poulose (4):
  coresight: etm-perf: Support PID tracing for kernel at EL2
  perf cs-etm: Fix bitmap for option
  perf cs-etm: Support PID tracing in config
  perf cs-etm: Detect pid in VMID for kernel running at EL2

 Documentation/trace/coresight/coresight.rst   | 37 ++
 .../hwtracing/coresight/coresight-etm-perf.c  | 32 +++-
 .../coresight/coresight-etm4x-core.c  | 13 
 include/linux/coresight-pmu.h | 20 +++--
 tools/include/linux/coresight-pmu.h   | 20 +++--
 tools/perf/arch/arm/util/cs-etm.c | 73 +++
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 32 +++-
 tools/perf/util/cs-etm.c  | 43 +++
 tools/perf/util/cs-etm.h  |  1 +
 9 files changed, 239 insertions(+), 32 deletions(-)

-- 
2.25.1



  1   2   3   4   5   6   7   8   9   10   >