[PATCH] powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc.

2020-07-03 Thread Anju T Sudhakar
IMC trace-mode records have the MSR[HV PR] bits added in the third DW.
These bits can be used to set the cpumode for the instruction pointer
captured in each sample.

Add support in the kernel to use these bits to set the cpumode for
each sample.
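
For reference, the decode maps the two bits to perf cpumodes as below. A
minimal standalone sketch (the helper name is invented here; the
PERF_RECORD_MISC_* values are the standard perf uapi ones):

#include <linux/perf_event.h>

/* Sketch: map MSR[HV PR], the top two bits of the record's third DW,
 * to a perf cpumode, mirroring the switch in the patch below.
 */
static inline int hvpr_to_cpumode(unsigned long long third_dw)
{
	switch (third_dw >> 62) {
	case 0: return PERF_RECORD_MISC_GUEST_KERNEL;	/* HV=0, PR=0 */
	case 1: return PERF_RECORD_MISC_GUEST_USER;	/* HV=0, PR=1 */
	case 2: return PERF_RECORD_MISC_HYPERVISOR;	/* HV=1, PR=0 */
	default: return PERF_RECORD_MISC_USER;		/* HV=1, PR=1 */
	}
}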
   
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/imc-pmu.h |  5 +++++
 arch/powerpc/perf/imc-pmu.c        | 29 ++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h
index 4da4fcba0684..4f897993b710 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -99,6 +99,11 @@ struct trace_imc_data {
  */
 #define IMC_TRACE_RECORD_TB1_MASK  0x3ffffffffffULL
 
+/*
+ * Bit 0:1 in third DW of IMC trace record
+ * specifies the MSR[HV PR] values.
+ */
+#define IMC_TRACE_RECORD_VAL_HVPR(x)   ((x) >> 62)
 
 /*
  * Device tree parser code detects IMC pmu support and
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..310922fed9eb 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1178,11 +1178,30 @@ static int trace_imc_prepare_sample(struct trace_imc_data *mem,
header->size = sizeof(*header) + event->header_size;
header->misc = 0;
 
-   if (is_kernel_addr(data->ip))
-   header->misc |= PERF_RECORD_MISC_KERNEL;
-   else
-   header->misc |= PERF_RECORD_MISC_USER;
-
+   if (cpu_has_feature(CPU_FTRS_POWER9)) {
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+   } else {
+   switch (IMC_TRACE_RECORD_VAL_HVPR(mem->val)) {
+   case 0: /* MSR HV and PR are not set in the trace-record */
+   header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+   break;
+   case 1: /* MSR HV is 0 and PR is 1 */
+   header->misc |= PERF_RECORD_MISC_GUEST_USER;
+   break;
+   case 2: /* MSR HV is 1 and PR is 0 */
+   header->misc |= PERF_RECORD_MISC_HYPERVISOR;
+   break;
+   case 3: /* MSR HV is 1 and PR is 1 */
+   header->misc |= PERF_RECORD_MISC_USER;
+   break;
+   default:
+   pr_info("IMC: Unable to set the flag based on MSR bits\n");
+   break;
+   }
+   }
perf_event_header__init_id(header, data, event);
 
return 0;
-- 
2.25.4



[PATCH 2/2] powerpc/perf: Add support for outputting extended regs in perf intr_regs

2020-04-29 Thread Anju T Sudhakar
The capability flag PERF_PMU_CAP_EXTENDED_REGS is used to indicate a
PMU which supports extended registers. The generic code defines the
mask of extended registers as 0 for unsupported architectures.

Add support for extended registers in the POWER9 architecture. For
POWER9, the extended registers are mmcr0, mmcr1 and mmcr2.

The REG_RESERVED mask is redefined to accommodate the extended registers.
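
A user-space consumer would then request these registers per sample via
sample_regs_intr. A minimal sketch (the function name is invented; it
assumes the new PERF_REG_POWERPC_MMCR* enum values from this patch are
visible through <asm/perf_regs.h> on powerpc):

#include <linux/perf_event.h>
#include <asm/perf_regs.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Open a self-monitoring cycles event that samples MMCR0-MMCR2. */
static int open_cycles_with_mmcrs(void)
{
	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.sample_period = 100000,
		.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_INTR,
		.sample_regs_intr = (1ULL << PERF_REG_POWERPC_MMCR0) |
				    (1ULL << PERF_REG_POWERPC_MMCR1) |
				    (1ULL << PERF_REG_POWERPC_MMCR2),
	};

	/* pid 0 = this process, cpu -1 = any cpu, no group fd, no flags */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}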

With patch:


# perf record -I?
available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14
r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26 r27 r28 r29 r30 r31 nip
msr orig_r3 ctr link xer ccr softe trap dar dsisr sier mmcra mmcr0
mmcr1 mmcr2

# perf record -I ls
# perf script -D

PERF_RECORD_SAMPLE(IP, 0x1): 9019/9019: 0 period: 1 addr: 0
... intr regs: mask 0x ABI 64-bit
 r0    0xc011b12c
 r1    0xc03f9a98b930
 r2    0xc1a32100
 r3    0xc03f8fe9a800
 r4    0xc03fd181
 r5    0x3e32557150
 r6    0xc03f9a98b908
 r7    0xffc1cdae06ac
 r8    0x818
[...]
 r31   0xc03ffd047230
 nip   0xc011b2c0
 msr   0x90009033
 orig_r3 0xc011b21c
 ctr   0xc0119380
 link  0xc011b12c
 xer   0x0
 ccr   0x2800
 softe 0x1
 trap  0xf00
 dar   0x0
 dsisr 0x800
 sier  0x0
 mmcra 0x800
 mmcr0 0x82008090
 mmcr1 0x1e00
 mmcr2 0x0
 ... thread: perf:9019

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/perf_event_server.h  |  5 +++
 arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 arch/powerpc/perf/core-book3s.c   |  1 +
 arch/powerpc/perf/perf_regs.c | 29 ++--
 arch/powerpc/perf/power9-pmu.c|  1 +
 .../arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 tools/perf/arch/powerpc/include/perf_regs.h   |  6 +++-
 tools/perf/arch/powerpc/util/perf_regs.c  | 33 +++
 8 files changed, 95 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 3e9703f44c7c..1d15953bd99e 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -55,6 +55,11 @@ struct power_pmu {
int *blacklist_ev;
/* BHRB entries in the PMU */
int bhrb_nr;
+   /*
+* set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if
+* the pmu supports extended perf regs capability
+*/
+   int capabilities;
 };
 
 /*
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064dd8dc..604b831378fe 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,17 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_DSISR,
PERF_REG_POWERPC_SIER,
PERF_REG_POWERPC_MMCRA,
-   PERF_REG_POWERPC_MAX,
+   /* Extended registers */
+   PERF_REG_POWERPC_MMCR0,
+   PERF_REG_POWERPC_MMCR1,
+   PERF_REG_POWERPC_MMCR2,
+   PERF_REG_EXTENDED_MAX,
+   /* Max regs without the extended regs */
+   PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
+
+#define PERF_REG_PMU_MASK  ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+#define PERF_REG_EXTENDED_MASK  (((1ULL << (PERF_REG_EXTENDED_MAX))\
+   - 1) - PERF_REG_PMU_MASK)
+
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 3dcfecf858f3..f56b77800a7b 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2276,6 +2276,7 @@ int register_power_pmu(struct power_pmu *pmu)
 
power_pmu.attr_groups = ppmu->attr_groups;
 
+   power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS);
 #ifdef MSR_HV
/*
 * Use FCHV to ignore kernel events if MSR.HV is set.
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index a213a0aa5d25..57aa02568caf 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -15,7 +15,8 @@
 
 #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
 
-#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+#define REG_RESERVED (~(PERF_REG_EXTENDED_MASK) &  \
+   (~((1ULL << PERF_REG_POWERPC_MAX) - 1)))
 
 static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
PT_REGS_OFFSET(PERF_REG_POWERPC_R0,  gpr[0]),
@@ -69,10 +70,22 @@ static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
PT_REGS_OFFSET(PERF_REG_POWERPC_MMCRA, dsisr),
 };
 
+/* Function to return the extended register values */
+static u64 get_ext_regs_value(int idx)
+{
+   switch (idx) {
+   case PERF_REG_POWERPC_MMCR0:
+
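
(The archive truncates the diff here. The remainder of get_ext_regs_value()
presumably reads each MMCR via mfspr(); a hedged reconstruction of the
missing tail, continuing from the case label above:)

		return mfspr(SPRN_MMCR0);
	case PERF_REG_POWERPC_MMCR1:
		return mfspr(SPRN_MMCR1);
	case PERF_REG_POWERPC_MMCR2:
		return mfspr(SPRN_MMCR2);
	default:
		return 0;
	}
}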

[PATCH 0/2] powerpc/perf: Add support for perf extended regs in powerpc

2020-04-29 Thread Anju T Sudhakar
Patch set to add support for perf extended register capability in
powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS is used to
indicate a PMU which supports extended registers. The generic code
defines the mask of extended registers as 0 for unsupported architectures.

Patch 2/2 defines this PERF_PMU_CAP_EXTENDED_REGS mask to output the
values of mmcr0, mmcr1 and mmcr2 for POWER9.
   
x86/perf_regs.h is included by util/intel-pt.c, which gets compiled
when building perf on powerpc. Since x86/perf_regs.h has
`PERF_EXTENDED_REG_MASK` defined, defining `PERF_EXTENDED_REG_MASK` for
powerpc to add support for perf extended regs would result in a perf
build error on powerpc. The powerpc architecture does not currently
support auxtrace. So as a workaround for this issue, patch 1/2 sets
NO_AUXTRACE for powerpc. (Any other solutions are welcome.)

Patch 2/2 also adds the extended regs to sample_reg_mask on the tool side,
for use with the `-I?` option.

Anju T Sudhakar (2):
  tools/perf: set no_auxtrace for powerpc
  powerpc/perf: Add support for outputting extended regs in perf
intr_regs

 arch/powerpc/include/asm/perf_event_server.h  |  5 +++
 arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 arch/powerpc/perf/core-book3s.c   |  1 +
 arch/powerpc/perf/perf_regs.c | 29 ++--
 arch/powerpc/perf/power9-pmu.c|  1 +
 .../arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 tools/perf/arch/powerpc/Makefile  |  1 +
 tools/perf/arch/powerpc/include/perf_regs.h   |  6 +++-
 tools/perf/arch/powerpc/util/perf_regs.c  | 33 +++
 9 files changed, 96 insertions(+), 6 deletions(-)

-- 
2.20.1



[PATCH 1/2] tools/perf: set no_auxtrace for powerpc

2020-04-29 Thread Anju T Sudhakar
x86/perf_regs.h is included by util/intel-pt.c, which gets compiled
when building perf on powerpc. Since x86/perf_regs.h has
`PERF_EXTENDED_REG_MASK` defined, defining `PERF_EXTENDED_REG_MASK` for
powerpc to add support for perf extended regs would result in a perf
build error on powerpc.

The powerpc architecture does not currently support auxtrace. So as a
workaround for this issue, set NO_AUXTRACE for powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index e58d00d62f02..9ebb5f513605 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -3,6 +3,7 @@ ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
 
+NO_AUXTRACE := 1
 HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
 PERF_HAVE_JITDUMP := 1
-- 
2.20.1



[PATCH v4 2/2] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-03-12 Thread Anju T Sudhakar
Commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu") disabled
IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

Patch to re-enable imc-trace mode in kernel.

The previous patch (1/2) in this series addresses the mode-switching issue
by implementing a global lock, restricting accumulation-mode and
trace-mode events to one class at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 968b9a4d1cd9..7824cc364bc4 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -268,14 +268,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v4 1/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-03-12 Thread Anju T Sudhakar
IMC (In-Memory Collection Counters) does performance monitoring in
two different modes, i.e. accumulation mode (core-imc and thread-imc
events) and trace mode (trace-imc events). A CPU thread can be in either
accumulation mode or trace mode at a time, and this is selected via the
LDBAR register in the POWER architecture. The current design does not
address the races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can run only one of core, thread or trace
imc events at a time, i.e. to run any core-imc events, thread/trace
imc events must not be enabled/monitored.
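
The scheme distils to a single id+refcount gate. A simplified sketch of
the idea (names are invented for illustration; the actual patch threads
this logic through each *_event_init()/destroy path and the hotplug
handlers):

#include <linux/mutex.h>

enum { IMC_FREE = 0, IMC_GATE_CORE, IMC_GATE_THREAD, IMC_GATE_TRACE };

static struct {
	struct mutex lock;
	int id;		/* which domain currently owns the IMC mode */
	int refc;	/* live events of that domain */
} gate = { .lock = __MUTEX_INITIALIZER(gate.lock) };

/* event_init path: claim the gate for one domain, or back off. */
static int imc_gate_take(int domain)
{
	int ret = 0;

	mutex_lock(&gate.lock);
	if (gate.id == IMC_FREE || gate.id == domain) {
		gate.id = domain;
		gate.refc++;
	} else {
		ret = -EBUSY;	/* a conflicting IMC mode is active */
	}
	mutex_unlock(&gate.lock);
	return ret;
}

/* event->destroy path: drop a reference; free the gate at zero. */
static void imc_gate_release(void)
{
	mutex_lock(&gate.lock);
	if (--gate.refc <= 0) {
		gate.refc = 0;
		gate.id = IMC_FREE;
	}
	mutex_unlock(&gate.lock);
}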

Signed-off-by: Anju T Sudhakar 
---
Changes from v3->v4:

- Added mutex lock for thread, core and trace imc cpu offline path.

Changes from v2->v3:

- Addressed the off-line comments from Michael Ellerman
- Optimized the *_event_init code path for trace, core and thread imc
- Handled the global refc in cpuhotplug scenario
- Re-order the patch series
- Removed the selftest patches and will send as a follow up patch

Changes from v1 -> v2:

- Added self test patches to the series.

---
 arch/powerpc/perf/imc-pmu.c | 173 +++-
 1 file changed, 149 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..eb82dda884e5 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -698,6 +708,16 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return -EINVAL;
 
ref->refc = 0;
+   /*
+* Reduce the global reference count, if this is the
+* last cpu in this core and core-imc event running
+* in this cpu.
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_CORE)
+   imc_global_refc.refc--;
+
+   mutex_unlock(&imc_global_refc.lock);
}
return 0;
 }
@@ -710,6 +730,23 @@ static int core_imc_pmu_cpumask_init(void)
 ppc_core_imc_cpu_offline);
 }
 
+static void reset_global_refc(struct perf_event *event)
+{
+   mutex_lock(&imc_global_refc.lock);
+   imc_global_refc.refc--;
+
+   /*
+* If no other thread is running any
+* event for this domain(thread/core/trace),
+* set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static void core_imc_counters_release(struct perf_event *event)
 {
int rc, core_id;
@@ -759,6 +796,8 @@ static void core_imc_counters_release(struct perf_event *event)
ref->refc = 0;
}
 mutex_unlock(&ref->lock);
+
+   reset_global_refc(event);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -819,6 +858,29 @@ static int core_imc_event_init(struct perf_event *event)
++ref->refc;
 mutex_unlock(&ref->lock);
 
+   /*
+* Since the system can run either in accumulation or trace-mode
+* of IMC at a time, core-imc events are allowed only if no other
+* trace/thread imc events are enabled/monitored.
+*
+* Take the global lock, and check the refc.id
+* to know whether any other trace/thread imc
+* events are running.
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
+   /*
+* No other trace/thread imc events are running in
+* the system, so set the refc.id to core-imc.
+*/
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
 event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
event->destroy = core_imc_counters_release;
return 0;
@@ -877,7 +939,23 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
 
 static int ppc_thread_imc_cpu

[PATCH v3 2/2] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-02-27 Thread Anju T Sudhakar
Commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu") disabled
IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

Patch to re-enable imc-trace mode in kernel.

The previous patch (1/2) in this series addresses the mode-switching issue
by implementing a global lock, restricting accumulation-mode and
trace-mode events to one class at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 000b350d4060..3b4518f4b643 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v3 1/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-02-27 Thread Anju T Sudhakar
IMC (In-Memory Collection Counters) does performance monitoring in
two different modes, i.e. accumulation mode (core-imc and thread-imc
events) and trace mode (trace-imc events). A CPU thread can be in either
accumulation mode or trace mode at a time, and this is selected via the
LDBAR register in the POWER architecture. The current design does not
address the races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can run only one of core, thread or trace
imc events at a time, i.e. to run any core-imc events, thread/trace
imc events must not be enabled/monitored.

Signed-off-by: Anju T Sudhakar 
---
Changes from v2->v3:

- Addressed the off-line comments from Michael Ellerman
- Optimized the *_event_init code path for trace, core and thread imc
- Handled the global refc in cpuhotplug scenario
- Re-order the patch series
- Removed the selftest patches and will send as a follow up patch

Changes from v1 -> v2:

- Added self test patches to the series.
---
 arch/powerpc/perf/imc-pmu.c | 165 ++--
 1 file changed, 141 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..a366e2ec0351 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;

+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -698,6 +708,13 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return -EINVAL;

ref->refc = 0;
+   /*
+* Reduce the global reference count, if this is the
+* last cpu in this core and core-imc event running
+* in this cpu.
+*/
+   if (imc_global_refc.id == IMC_DOMAIN_CORE)
+   imc_global_refc.refc--;
}
return 0;
 }
@@ -710,6 +727,23 @@ static int core_imc_pmu_cpumask_init(void)
 ppc_core_imc_cpu_offline);
 }

+static void reset_global_refc(struct perf_event *event)
+{
+   mutex_lock(&imc_global_refc.lock);
+   imc_global_refc.refc--;
+
+   /*
+* If no other thread is running any
+* event for this domain(thread/core/trace),
+* set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static void core_imc_counters_release(struct perf_event *event)
 {
int rc, core_id;
@@ -759,6 +793,8 @@ static void core_imc_counters_release(struct perf_event *event)
ref->refc = 0;
}
 mutex_unlock(&ref->lock);
+
+   reset_global_refc(event);
 }

 static int core_imc_event_init(struct perf_event *event)
@@ -819,6 +855,29 @@ static int core_imc_event_init(struct perf_event *event)
++ref->refc;
 mutex_unlock(&ref->lock);

+   /*
+* Since the system can run either in accumulation or trace-mode
+* of IMC at a time, core-imc events are allowed only if no other
+* trace/thread imc events are enabled/monitored.
+*
+* Take the global lock, and check the refc.id
+* to know whether any other trace/thread imc
+* events are running.
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
+   /*
+* No other trace/thread imc events are running in
+* the system, so set the refc.id to core-imc.
+*/
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
 event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
event->destroy = core_imc_counters_release;
return 0;
@@ -877,7 +936,20 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)

 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-   mtspr(SPRN_LDBAR, 0);
+   /*
+* Set the bit 0 of LDBAR to zero.
+*
+* If bit 0 of LDBAR is unset, it will stop posting
+* the coune

Re: [PATCH v5 07/10] powerpc/perf: open access for CAP_PERFMON privileged process

2020-01-22 Thread Anju T Sudhakar



On 1/20/20 5:00 PM, Alexey Budankov wrote:

Open access to monitoring for CAP_PERFMON privileged processes.
For backward compatibility reasons access to the monitoring remains
open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage
for secure monitoring is discouraged with respect to CAP_PERFMON
capability. Providing the access under CAP_PERFMON capability singly,
without the rest of CAP_SYS_ADMIN credentials, excludes chances to
misuse the credentials and makes the operations more secure.

Signed-off-by: Alexey Budankov
---


Acked-by: Anju T Sudhakar



[PATCH v2 3/5] powerpc/perf: Add an interface sub-folder to imc pmu

2020-01-21 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Patch adds an interface attribute folder to the imc pmu.
This is intended to include pmu interface capabilities
which will be useful to userspace consumers such as selftest
testcases. Patch adds a "glob_lck" file to notify userspace
of the global lock mechanism added to imc devices
like core, thread and trace.

"glob_lck" will be used by the selftest file to execute an
interface test for the global lock mechanism.
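
Userspace can detect the mechanism by probing for the file, as the
selftest later in this series does. A minimal sketch:

#include <stdbool.h>
#include <unistd.h>

/* True if the kernel exposes the imc global-lock interface file. */
static bool imc_has_glob_lck(void)
{
	return access("/sys/devices/thread_imc/interface/glob_lck", F_OK) == 0;
}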

Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h | 11 ++-
 arch/powerpc/perf/imc-pmu.c| 19 +++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h
index 4da4fcba0684..1b2c33c30e7c 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -87,8 +87,9 @@ struct trace_imc_data {
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
-#define IMC_CPUMASK_ATTR   2
-#define IMC_NULL_ATTR  3
+#define IMC_INTERFACE_ATTR 2
+#define IMC_CPUMASK_ATTR   3
+#define IMC_NULL_ATTR  4
 
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xffffffffULL
@@ -114,10 +115,10 @@ struct imc_pmu {
/*
 * Attribute groups for the PMU. Slot 0 used for
 * format attribute, slot 1 used for cpusmask attribute,
-* slot 2 used for event attribute. Slot 3 keep as
-* NULL.
+* slot 2 used for event attribute. Slot 3 used for interface
+* attribute and Slot 4 is NULL.
 */
-   const struct attribute_group *attr_groups[4];
+   const struct attribute_group *attr_groups[5];
u32 counter_mem_size;
int domain;
/*
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 2e220f199530..3f49664f29f1 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -54,6 +54,24 @@ static struct imc_pmu_ref imc_global_refc = {
.refc = 0,
 };
 
+static ssize_t glob_lck_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   return sprintf(buf, "%d\n", 1);
+}
+
+static DEVICE_ATTR_RO(glob_lck);
+
+static struct attribute *imc_interface_attrs[] = {
+   &dev_attr_glob_lck.attr,
+   NULL,
+};
+
+static struct attribute_group imc_interface_group = {
+   .name = "interface",
+   .attrs = imc_interface_attrs,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1462,6 +1480,7 @@ static int update_pmu_ops(struct imc_pmu *pmu)
pmu->pmu.attr_groups = pmu->attr_groups;
pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
 pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group;
+   pmu->attr_groups[IMC_INTERFACE_ATTR] = &imc_interface_group;
 
switch (pmu->domain) {
case IMC_DOMAIN_NEST:
-- 
2.20.1



[PATCH v2 2/5] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-01-21 Thread Anju T Sudhakar
IMC (In-Memory Collection Counters) does performance monitoring in
two different modes, i.e. accumulation mode (core-imc and thread-imc
events) and trace mode (trace-imc events). A CPU thread can be in either
accumulation mode or trace mode at a time, and this is selected via the
LDBAR register in the POWER architecture. The current design does not
address the races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can run only one of core, thread or trace
imc events at a time, i.e. to run any core-imc events, thread/trace
imc events must not be enabled/monitored.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 177 +++-
 1 file changed, 153 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..2e220f199530 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -759,6 +769,20 @@ static void core_imc_counters_release(struct perf_event *event)
ref->refc = 0;
}
 mutex_unlock(&ref->lock);
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any core-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -779,6 +803,22 @@ static int core_imc_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
 
+   /*
+* Take the global lock, and make sure
+* no other thread is running any trace OR thread imc event
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
event->hw.idx = -1;
pmu = imc_event_to_pmu(event);
 
@@ -877,7 +917,16 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
 
 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-   mtspr(SPRN_LDBAR, 0);
+   /*
+* Toggle the bit 0 of LDBAR.
+*
+* If bit 0 of LDBAR is unset, it will stop posting
+* the counter data to memory.
+* For thread-imc, bit 0 of LDBAR will be set to 1 in the
+* event_add function. So toggle this bit here, to stop the updates
+* to memory in the cpu_offline path.
+*/
+   mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) ^ (1UL << 63)));
return 0;
 }
 
@@ -889,6 +938,24 @@ static int thread_imc_cpu_init(void)
  ppc_thread_imc_cpu_offline);
 }
 
+static void thread_imc_counters_release(struct perf_event *event)
+{
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_THREAD) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any thread-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static int thread_imc_event_init(struct perf_event *event)
 {
u32 config = event->attr.config;
@@ -905,6 +972,27 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->hw.sample_period)
return -EINVAL;
 
+   mutex_lock(&imc_global_refc.lock);
+   /*
+* Check if any other thread is running
+* core-engine, if not set the global id to
+* thread-imc.
+*/
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_DOMAIN_THREAD;
+   imc_global_refc.refc++;
+   } else if (imc

[PATCH v2 5/5] selftest/powerpc/pmu: Testcase for imc global lock mechanism

2020-01-21 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Signed-off-by: Madhavan Srinivasan 
---
 .../pmu/mem_counters/imc_global_lock_test.c   | 49 ++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
index ea687ffc1990..f643dba8ecc0 100644
--- a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
@@ -5,9 +5,56 @@
 
 #include "mem_counters.h"
 
+static bool check_imc_interface_glob_lck(void)
+{
+   if (!access("/sys/devices/thread_imc/interface/glob_lck", F_OK))
+   return true;
+
+   return false;
+}
+
 static int testcase(void)
 {
-   return 0;
+   struct event events[2];
+
+   if (!check_imc_interface_glob_lck()) {
+   printf("Test not supported\n");
+   return MAGIC_SKIP_RETURN_VALUE;
+   }
+
+   if (!is_mem_counters_device_enabled(CORE) || !is_mem_counters_device_enabled(THREAD)) {
+   printf("%s: IMC device not found. So exiting the test\n", __FUNCTION__);
+   return -1;
+   }
+
+   if (setup_mem_counters_event(THREAD, &events[0], 0xe0, "thread_imc/cycles")) {
+   printf("%s setup_mem_counters_event for thread_imc failed\n", __FUNCTION__);
+   return -1;
+   }
+
+   if (setup_mem_counters_event(CORE, &events[1], 0xe0, "core_imc/cycles")) {
+   printf("%s setup_mem_counters_event for core_imc failed\n", __FUNCTION__);
+   return -1;
+   }
+
+   if (event_open(&events[0])) {
+   perror("thread_imc: perf_event_open");
+   return -1;
+   }
+
+   /*
+* If we have the Global lock patchset applied to kernel
+* event_open for events[1] should fail with resource busy
+*/
+   if (event_open_with_cpu(&events[1], 0)) {
+   /*
+* Check for the errno to certify the test result
+*/
+   if (errno == 16) // Resource busy (EBUSY)
+   return 0;
+   }
+
+   return -1;
 }
 
 static int imc_global_lock_test(void)
-- 
2.20.1



[PATCH v2 4/5] selftest/powerpc/pmc: Support to include interface test for Memory Counter PMUs

2020-01-21 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Patch to add support to include interface tests for
memory counter PMUs as part of selftest.
These PMUs are primarily used to understand socket/chip/core
resource usage. In the PowerNV environment, the perf interface
registered to access these counters is called "In Memory Collection"
(IMC), and in PowerVM, the perf interface registered to access
these counters is called "hv_24x7".

New folder "mem_counters" added under selftest/powerpc/pmu.
This will include interface tests for both "imc" and "hv_24x7"
pmus. Patch adds the base/common functions needed.
To make blame easier, a place-holder test function is added in
this patch. A subsequent patch will fill in the actual test
content.

Signed-off-by: Madhavan Srinivasan 
---
 tools/testing/selftests/powerpc/pmu/Makefile  |  7 +-
 .../powerpc/pmu/mem_counters/Makefile | 21 
 .../pmu/mem_counters/imc_global_lock_test.c   | 21 
 .../powerpc/pmu/mem_counters/mem_counters.c   | 99 +++
 .../powerpc/pmu/mem_counters/mem_counters.h   | 36 +++
 5 files changed, 182 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
 create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
 create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
 create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.h

diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile
index 19046db995fe..e352eceac0a9 100644
--- a/tools/testing/selftests/powerpc/pmu/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/Makefile
@@ -8,7 +8,7 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c
 top_srcdir = ../../../../..
 include ../../lib.mk
 
-all: $(TEST_GEN_PROGS) ebb
+all: $(TEST_GEN_PROGS) ebb mem_counters
 
 $(TEST_GEN_PROGS): $(EXTRA_SOURCES)
 
@@ -43,4 +43,7 @@ clean:
 ebb:
 TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all
 
-.PHONY: all run_tests clean ebb
+mem_counters:
+   TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all
+
+.PHONY: all run_tests clean ebb mem_counters
diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile b/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
new file mode 100644
index 000000000000..f39ebe30ab70
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+include ../../../../../../scripts/Kbuild.include
+
+noarg:
+   $(MAKE) -C ../../
+
+CFLAGS += -m64
+
+# Toolchains may build PIE by default which breaks the assembly
+no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
+$(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie)
+
+LDFLAGS += $(no-pie-option)
+
+TEST_GEN_PROGS := imc_global_lock_test
+
+top_srcdir = ../../../../../..
+include ../../../lib.mk
+
+$(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ./mem_counters.c \
+   imc_global_lock_test.c
diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
new file mode 100644
index 000000000000..ea687ffc1990
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020, Madhavan Srinivasan, IBM Corp.
+ */
+
+#include "mem_counters.h"
+
+static int testcase(void)
+{
+   return 0;
+}
+
+static int imc_global_lock_test(void)
+{
+   return eat_cpu(testcase);
+}
+
+int main(void)
+{
+   return test_harness(imc_global_lock_test, "imc_global_lock_test");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c b/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
new file mode 100644
index 000000000000..b0ee1319f018
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020, Madhavan Srinivasan, IBM Corp.
+ */
+
+#include "mem_counters.h"
+
+/*
+ * mem_counters.c will contain common/basic functions
+ * to support testcases for both In Memory Collection (IMC)
+ * and hv_24x7 counters.
+ */
+
+
+/*
+ * Since device type enum starts with 1,
+ * have the first entry in the array as a placeholder.
+ */
+const char mem_counters_dev_path[][30] = {
+   "",
+   "/sys/devices/thread_imc",
+   "/sys/devices/trace_imc",
+   "/sys/devices/core_imc",
+   "/sys/devices/hv_24x7",
+   "",
+};
+
+const char mem_counters_dev_type_path[][35] = {
+   "",
+   "/sys/devices/thread_imc/type",
+   "/sys/devices/trace_imc/type",
+   

[PATCH v2 1/5] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-01-21 Thread Anju T Sudhakar
Commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu") disabled
IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

Patch to re-enable imc-trace mode in kernel.

The following patch in this series addresses the mode-switching issue
by implementing a global lock, restricting accumulation-mode and
trace-mode events to one class at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 000b350d4060..3b4518f4b643 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v2 0/5] Re-enable IMC trace-mode

2020-01-21 Thread Anju T Sudhakar
Commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu") disabled
IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

This patch series re-enables IMC trace mode and fixes the mode-switching
issue with a global lock mechanism.

Patches 3/5, 4/5 and 5/5 provide a selftest to verify the global-lock
mechanism.

Changes from v1 -> v2:
-
- Added self test patches to the series.

Anju T Sudhakar (2):
  powerpc/powernv: Re-enable imc trace-mode in kernel
  powerpc/perf: Implement a global lock to avoid races between trace,
core and thread imc events.

Madhavan Srinivasan (3):
  powerpc/perf: Add an interface sub-folder to imc pmu
  selftest/powerpc/pmc: Support to include interface test for Memory
Counter PMUs
  selftest/powerpc/pmu: Testcase for imc global lock mechanism


 arch/powerpc/include/asm/imc-pmu.h|  11 +-
 arch/powerpc/perf/imc-pmu.c   | 196 +++---
 arch/powerpc/platforms/powernv/opal-imc.c |   9 +-
 tools/testing/selftests/powerpc/pmu/Makefile  |   7 +-
 .../powerpc/pmu/mem_counters/Makefile |  21 ++
 .../pmu/mem_counters/imc_global_lock_test.c   |  68 ++
 .../powerpc/pmu/mem_counters/mem_counters.c   |  99 +
 .../powerpc/pmu/mem_counters/mem_counters.h   |  36 
 8 files changed, 408 insertions(+), 39 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.h

-- 
2.18.1



[PATCH 2/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-01-20 Thread Anju T Sudhakar
IMC (In-Memory Collection Counters) does performance monitoring in
two different modes, i.e. accumulation mode (core-imc and thread-imc
events) and trace mode (trace-imc events). A CPU thread can be in either
accumulation mode or trace mode at a time, and this is selected via the
LDBAR register in the POWER architecture. The current design does not
address the races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can run only one of core, thread or trace
imc events at a time, i.e. to run any core-imc events, thread/trace
imc events must not be enabled/monitored.
 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 177 +++-
 1 file changed, 153 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..2e220f199530 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -759,6 +769,20 @@ static void core_imc_counters_release(struct perf_event *event)
ref->refc = 0;
}
 mutex_unlock(&ref->lock);
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any core-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -779,6 +803,22 @@ static int core_imc_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
 
+   /*
+* Take the global lock, and make sure
+* no other thread is running any trace OR thread imc event
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
event->hw.idx = -1;
pmu = imc_event_to_pmu(event);
 
@@ -877,7 +917,16 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
 
 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-   mtspr(SPRN_LDBAR, 0);
+   /*
+* Toggle the bit 0 of LDBAR.
+*
+* If bit 0 of LDBAR is unset, it will stop posting
+* the counter data to memory.
+* For thread-imc, bit 0 of LDBAR will be set to 1 in the
+* event_add function. So toggle this bit here, to stop the updates
+* to memory in the cpu_offline path.
+*/
+   mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) ^ (1UL << 63)));
return 0;
 }
 
@@ -889,6 +938,24 @@ static int thread_imc_cpu_init(void)
  ppc_thread_imc_cpu_offline);
 }
 
+static void thread_imc_counters_release(struct perf_event *event)
+{
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_THREAD) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any thread-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static int thread_imc_event_init(struct perf_event *event)
 {
u32 config = event->attr.config;
@@ -905,6 +972,27 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->hw.sample_period)
return -EINVAL;
 
+   mutex_lock(&imc_global_refc.lock);
+   /*
+* Check if any other thread is running
+* core-engine, if not set the global id to
+* thread-imc.
+*/
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_DOMAIN_THREAD;
+   imc_global_refc.refc++;
+   } else if (imc

[PATCH 1/2] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-01-20 Thread Anju T Sudhakar
Commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu") disabled
IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

Patch to re-enable imc-trace mode in kernel.

The following patch in this series addresses the mode-switching issue
by implementing a global lock, restricting accumulation-mode and
trace-mode events to one class at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 000b350d4060..3b4518f4b643 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v3] platforms/powernv: Avoid re-registration of imc debugfs directory

2019-11-26 Thread Anju T Sudhakar
The export_imc_mode_and_cmd() function, which creates the debugfs
interface for imc-mode and imc-command, is invoked when each nest pmu
unit is registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates the 'imc' directory under `/debug/powerpc/`. In the subsequent
invocations debugfs_create_dir() returns, since the directory
already exists.

The recent commit  (debugfs: make error message a bit more
verbose) throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by moving the debugfs directory registration
to the opal_imc_counters_probe() function, i.e. invoke
export_imc_mode_and_cmd() from the probe function.

Signed-off-by: Anju T Sudhakar 
---
Changes from v2 -> v3:

* Invoke export_imc_mode_and_cmd(), which does the imc debugfs
  directory registration and deletion, from the probe function.
* Change the return type of imc_pmu_create() to get the
  control block address for nest units in the probe
  function
* Remove unnecessary comments

---
 arch/powerpc/platforms/powernv/opal-imc.c | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b206..3b4518f 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -59,10 +59,6 @@ static void export_imc_mode_and_cmd(struct device_node *node,
 
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
-   /*
-* Return here, either because 'imc' directory already exists,
-* Or failed to create a new one.
-*/
if (!imc_debugfs_parent)
return;
 
@@ -135,7 +131,6 @@ static int imc_get_mem_addr_nest(struct device_node *node,
}
 
pmu_ptr->imc_counter_mmaped = true;
-   export_imc_mode_and_cmd(node, pmu_ptr);
kfree(base_addr_arr);
kfree(chipid_arr);
return 0;
@@ -151,7 +146,7 @@ static int imc_get_mem_addr_nest(struct device_node *node,
  * and domain as the inputs.
  * Allocates memory for the struct imc_pmu, sets up its domain, size and offsets
  */
-static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
+static struct imc_pmu *imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
 {
int ret = 0;
struct imc_pmu *pmu_ptr;
@@ -159,27 +154,23 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
 
/* Return for unknown domain */
if (domain < 0)
-   return -EINVAL;
+   return NULL;
 
/* memory for pmu */
pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
if (!pmu_ptr)
-   return -ENOMEM;
+   return NULL;
 
/* Set the domain */
pmu_ptr->domain = domain;
 
 ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size);
-   if (ret) {
-   ret = -EINVAL;
+   if (ret)
goto free_pmu;
-   }
 
 if (!of_property_read_u32(parent, "offset", &offset)) {
-   if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) {
-   ret = -EINVAL;
+   if (imc_get_mem_addr_nest(parent, pmu_ptr, offset))
goto free_pmu;
-   }
}
 
/* Function to register IMC pmu */
@@ -190,14 +181,14 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
if (pmu_ptr->domain == IMC_DOMAIN_NEST)
kfree(pmu_ptr->mem_info);
kfree(pmu_ptr);
-   return ret;
+   return NULL;
}
 
-   return 0;
+   return pmu_ptr;
 
 free_pmu:
kfree(pmu_ptr);
-   return ret;
+   return NULL;
 }
 
 static void disable_nest_pmu_counters(void)
@@ -254,6 +245,7 @@ int get_max_nest_dev(void)
 static int opal_imc_counters_probe(struct platform_device *pdev)
 {
struct device_node *imc_dev = pdev->dev.of_node;
+   struct imc_pmu *pmu;
int pmu_count = 0, domain;
bool core_imc_reg = false, thread_imc_reg = false;
u32 type;
@@ -269,6 +261,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
}
 
for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) {
+   pmu = NULL;
 if (of_property_read_u32(imc_dev, "type", &type)) {
pr_warn("IMC Device without type property\n");
continue;
@@ -293,9 +286,13 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
break;
}
 
-   if (!imc_pmu_create(imc_dev, pmu_count, domain)) {
-   if (domain == IMC_DO

[tip: perf/urgent] perf kvm: Move kvm-stat header file from conditional inclusion to common include section

2019-09-20 Thread tip-bot2 for Anju T Sudhakar
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: 8067b3da970baa12e6045400fdf009673b8dd3c2
Gitweb: https://git.kernel.org/tip/8067b3da970baa12e6045400fdf009673b8dd3c2
Author: Anju T Sudhakar 
AuthorDate: Thu, 18 Jul 2019 23:47:47 +05:30
Committer: Arnaldo Carvalho de Melo 
CommitterDate: Fri, 20 Sep 2019 10:28:26 -03:00

perf kvm: Move kvm-stat header file from conditional inclusion to common include section

Move kvm-stat header file to the common include section, and make the
definitions in the header file under the conditional inclusion `#ifdef
HAVE_KVM_STAT_SUPPORT`.

This helps to define other 'perf kvm' related function prototypes in
kvm-stat header file, which may not need kvm-stat support.

Signed-off-by: Anju T Sudhakar 
Reviewed-By: Ravi Bangoria 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Madhavan Srinivasan 
Cc: Michael Ellerman 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lore.kernel.org/lkml/20190718181749.30612-1-a...@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-kvm.c   | 2 +-
 tools/perf/util/kvm-stat.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index ac6d6e0..2b822be 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -21,6 +21,7 @@
 #include "util/top.h"
 #include "util/data.h"
 #include "util/ordered-events.h"
+#include "util/kvm-stat.h"
 #include "ui/ui.h"
 
 #include 
@@ -59,7 +60,6 @@ static const char *get_filename_for_perf_kvm(void)
 }
 
 #ifdef HAVE_KVM_STAT_SUPPORT
-#include "util/kvm-stat.h"
 
 void exit_event_get_key(struct evsel *evsel,
struct perf_sample *sample,
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 4691363..8fd6ec2 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -2,6 +2,8 @@
 #ifndef __PERF_KVM_STAT_H
 #define __PERF_KVM_STAT_H
 
+#ifdef HAVE_KVM_STAT_SUPPORT
+
 #include "tool.h"
 #include "stat.h"
 #include "record.h"
@@ -144,5 +146,6 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
+#endif /* HAVE_KVM_STAT_SUPPORT */
 
 #endif /* __PERF_KVM_STAT_H */


[tip: perf/urgent] perf kvm: Add arch neutral function to choose event for perf kvm record

2019-09-20 Thread tip-bot2 for Anju T Sudhakar
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: 124eb5f82bf9395419b20205c4dcc1b8fcda7f29
Gitweb: https://git.kernel.org/tip/124eb5f82bf9395419b20205c4dcc1b8fcda7f29
Author: Anju T Sudhakar 
AuthorDate: Thu, 18 Jul 2019 23:47:48 +05:30
Committer: Arnaldo Carvalho de Melo 
CommitterDate: Fri, 20 Sep 2019 10:28:26 -03:00

perf kvm: Add arch neutral function to choose event for perf kvm record

'perf kvm record' uses 'cycles'(if the user did not specify any event)
as the default event to profile the guest.

This will not provide any proper samples from the guest in the case of
the powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch specific event for 'perf kvm
record', instead of selecting 'cycles' as a default event for all
architectures.

For powerpc this function checks for any user specified event, and if
there isn't any it returns invalid instead of proceeding with 'cycles'
event.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Ravi Bangoria 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Madhavan Srinivasan 
Cc: Michael Ellerman 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lore.kernel.org/lkml/20190718181749.30612-2-a...@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 37 -
 tools/perf/builtin-kvm.c| 12 +++-
 tools/perf/util/kvm-stat.h  |  1 +-
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index f0dbf7b..ec5b771 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include 
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 
return ret;
 }
+
+/*
+ * Incase of powerpc architecture, pmu registers are programmable
+ * by guest kernel. So monitoring guest via host may not provide
+ * valid samples. It is better to fail the "perf kvm record"
+ * with default "cycles" event to monitor guest in powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+   const char **tmp;
+   bool event = false;
+   int i, j = *argc;
+
+   const struct option event_options[] = {
+   OPT_BOOLEAN('e', "event", &event, NULL),
+   OPT_END()
+   };
+
+   tmp = calloc(j + 1, sizeof(char *));
+   if (!tmp)
+   return -EINVAL;
+
+   for (i = 0; i < j; i++)
+   tmp[i] = argv[i];
+
+   parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
+   if (!event) {
+   free(tmp);
+   return -EINVAL;
+   }
+
+   free(tmp);
+   return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 2b822be..6e3e366 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1514,11 +1514,21 @@ perf_stat:
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+   const char **argv __maybe_unused)
+{
+   return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-   int rec_argc, i = 0, j;
+   int rec_argc, i = 0, j, ret;
const char **rec_argv;
 
+   ret = kvm_add_default_arch_event(&argc, argv);
+   if (ret)
+   return -EINVAL;
+
rec_argc = argc + 2;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 8fd6ec2..6f0fa05 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -148,4 +148,5 @@ extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */


[tip: perf/urgent] perf kvm stat: Set 'trace_cycles' as default event for 'perf kvm record' in powerpc

2019-09-20 Thread tip-bot2 for Anju T Sudhakar
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: 2bff2b828502b5e5d5ea5a52643d3542053df03f
Gitweb: https://git.kernel.org/tip/2bff2b828502b5e5d5ea5a52643d3542053df03f
Author: Anju T Sudhakar 
AuthorDate: Thu, 18 Jul 2019 23:47:49 +05:30
Committer: Arnaldo Carvalho de Melo 
CommitterDate: Fri, 20 Sep 2019 10:28:26 -03:00

perf kvm stat: Set 'trace_cycles' as default event for 'perf kvm record' in powerpc

Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Ravi Bangoria 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Madhavan Srinivasan 
Cc: Michael Ellerman 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lore.kernel.org/lkml/20190718181749.30612-3-a...@linux.vnet.ibm.com
[ Add missing pmu.h header, needed because this patch uses pmu_have_event() ]
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index ec5b771..9cc1c4a 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -5,6 +5,7 @@
 #include "util/debug.h"
 #include "util/evsel.h"
 #include "util/evlist.h"
+#include "util/pmu.h"
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
@@ -177,8 +178,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 /*
 * In case of the powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +204,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv)
 
parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
if (!event) {
-   free(tmp);
-   return -EINVAL;
+   if (pmu_have_event("trace_imc", "trace_cycles")) {
+   argv[j++] = strdup("-e");
+   argv[j++] = strdup("trace_imc/trace_cycles/");
+   *argc += 2;
+   } else {
+   free(tmp);
+   return -EINVAL;
+   }
}
 
free(tmp);


[PATCH v2] platform/powernv: Avoid re-registration of imc debugfs directory

2019-09-03 Thread Anju T Sudhakar
The export_imc_mode_and_cmd() function, which creates the debugfs interface
for imc-mode and imc-command, is invoked when each nest pmu unit is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates the 'imc' directory under `/debug/powerpc/`. In subsequent
invocations debugfs_create_dir() simply returns, since the directory
already exists.

The recent commit c33d442328f55 ("debugfs: make error message a bit more
verbose") throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by searching for an existing 'imc' directory, and
do not invoke debugfs_create_dir() if the debugfs interface for
imc already exists.

This patch is based on:
https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-August/195898.html

Signed-off-by: Anju T Sudhakar 
Tested-by: Nageswara R Sastry 
---
Changes from v1 -> v2

* Minor changes in the commit message.
---
 arch/powerpc/platforms/powernv/opal-imc.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b20625cb9..fc2f0e60a44d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node *node,
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
char mode[16], cmd[16];
u32 cb_offset;
+   struct dentry *dir = NULL;
struct imc_mem_info *ptr = pmu_ptr->mem_info;
 
+
+   /* Return, if 'imc' interface already exists */
+   dir = debugfs_lookup("imc", powerpc_debugfs_root);
+   if (dir) {
+   dput(dir);
+   return;
+   }
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
-   /*
-* Return here, either because 'imc' directory already exists,
-* Or failed to create a new one.
-*/
+   /* Return here, if failed to create the directory */
if (!imc_debugfs_parent)
return;
 
-- 
2.20.1



Re: [PATCH] platform/powernv: Avoid re-registration of imc debugfs directory

2019-08-20 Thread Anju T Sudhakar

Hi,

On 8/21/19 10:16 AM, Oliver O'Halloran wrote:

On Wed, Aug 21, 2019 at 2:10 PM Anju T Sudhakar  wrote:

The export_imc_mode_and_cmd() function, which creates the debugfs interface
for imc-mode and imc-command, is invoked when each nest pmu unit is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates the 'imc' directory under `/debug/powerpc`. In subsequent
invocations debugfs_create_dir() simply returns, since the directory
already exists.

The recent commit c33d442328f55 ("debugfs: make error message a bit more
verbose") throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by looking up an existing 'imc' directory, and
do not invoke debugfs_create_dir() if the debugfs interface for
imc already exists.

This patch is based on:
https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-July/192979.html

Signed-off-by: Anju T Sudhakar 
Tested-by: Nageswara R Sastry 
---
  arch/powerpc/platforms/powernv/opal-imc.c | 13 +
  1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b20625cb9..fc2f0e60a44d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node *node,
 static u64 loc, *imc_mode_addr, *imc_cmd_addr;
 char mode[16], cmd[16];
 u32 cb_offset;
+   struct dentry *dir = NULL;
 struct imc_mem_info *ptr = pmu_ptr->mem_info;

+
+   /* Return, if 'imc' interface already exists */
+   dir = debugfs_lookup("imc", powerpc_debugfs_root);
+   if (dir) {
+   dput(dir);
+   return;
+   }
 imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);

Is there a reason why we create the debugfs directory here and not in
opal_imc_counters_probe()? There's logic to remove the debugfs
directory in _probe() already so it seems like a more natural place
for it.


Good point. But we can only create the parent directory, i.e. the 'imc'
directory, in `_probe()`; the entries can be created only here. The
reason is that these debugfs entries are only for IMC nest units. So,
to get the imc mode and command values from the nest memory region we
need the relevant offsets from the control block structure.

Since imc_get_mem_addr_nest() reads this address for each chip, we
invoke the function that creates the debugfs entries after these values
are populated (i.e. export_imc_mode_and_cmd() is invoked by
imc_get_mem_addr_nest()).

Also, if we create the parent directory in `_probe()`, we would need to
track whether the entries (i.e. imc_cmd and imc_mode) were created or
not.
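To make the ordering concrete, here is a minimal, self-contained sketch of
the call flow described above (the function names follow this thread; the
bodies are illustrative stand-ins, not kernel source):

	#include <stdio.h>

	static unsigned long vbase;	/* stand-in for pmu_ptr->mem_info[].vbase */

	static void export_imc_mode_and_cmd(void)
	{
		/* Runs only once vbase and cb_offset are known, so the 'imc'
		 * directory and the imc_mode_n / imc_cmd_n entries are made here.
		 */
		printf("create debugfs entries against vbase %#lx\n", vbase);
	}

	static void imc_get_mem_addr_nest(void)
	{
		vbase = 0x1000;			/* pretend: read the per-chip base */
		export_imc_mode_and_cmd();	/* invoked after values are populated */
	}

	static void opal_imc_counters_probe(void)
	{
		imc_get_mem_addr_nest();	/* probe itself never touches debugfs */
	}

	int main(void)
	{
		opal_imc_counters_probe();
		return 0;
	}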


Regards,

Anju



[PATCH] platform/powernv: Avoid re-registration of imc debugfs directory

2019-08-20 Thread Anju T Sudhakar
The export_imc_mode_and_cmd() function, which creates the debugfs interface
for imc-mode and imc-command, is invoked when each nest pmu unit is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates the 'imc' directory under `/debug/powerpc`. In subsequent
invocations debugfs_create_dir() simply returns, since the directory
already exists.

The recent commit c33d442328f55 ("debugfs: make error message a bit more
verbose") throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by looking up an existing 'imc' directory, and
do not invoke debugfs_create_dir() if the debugfs interface for
imc already exists.

This patch is based on:
   https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-July/192979.html

Signed-off-by: Anju T Sudhakar 
Tested-by: Nageswara R Sastry 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b20625cb9..fc2f0e60a44d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node *node,
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
char mode[16], cmd[16];
u32 cb_offset;
+   struct dentry *dir = NULL;
struct imc_mem_info *ptr = pmu_ptr->mem_info;
 
+
+   /* Return, if 'imc' interface already exists */
+   dir = debugfs_lookup("imc", powerpc_debugfs_root);
+   if (dir) {
+   dput(dir);
+   return;
+   }
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
-   /*
-* Return here, either because 'imc' directory already exists,
-* Or failed to create a new one.
-*/
+   /* Return here, if failed to create the directory */
if (!imc_debugfs_parent)
return;
 
-- 
2.20.1



Re: [RFC PATCH 3/4] powerpc/perf: fix imc allocation failure

2019-07-23 Thread Anju T Sudhakar



On 7/22/19 11:16 PM, Nicholas Piggin wrote:

alloc_pages_node return value should be tested before applying
page_address.

Cc: Anju T Sudhakar 
Cc: Madhavan Srinivasan 
Signed-off-by: Nicholas Piggin 
---


Tested-by: Anju T Sudhakar 



Re: [PATCH v2] powerpc/imc: Dont create debugfs files for cpu-less nodes

2019-07-23 Thread Anju T Sudhakar

Hi Qian,

On 7/16/19 12:11 AM, Qian Cai wrote:

On Thu, 2019-07-11 at 14:53 +1000, Michael Ellerman wrote:

Hi Maddy,

Madhavan Srinivasan  writes:

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..e04b20625cb9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node,
    if (of_property_read_u32(node, "cb_offset", &cb_offset))
    cb_offset = IMC_CNTL_BLK_OFFSET;
  
-	for_each_node(nid) {
-		loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
+	while (ptr->vbase != NULL) {

This means you'll bail out as soon as you find a node with no vbase, but
it's possible we could have a CPU-less node intermingled with other
nodes.

So I think you want to keep the for loop, but continue if you see a NULL
vbase?

Not sure if this will also take care of some of those messages during the boot
on today's linux-next even without this patch.


[   18.077780][T1] debugfs: Directory 'imc' with parent 'powerpc' already
present!




This is introduced by a recent commit: c33d442328f55 ("debugfs: make
error message a bit more verbose").

So basically, the debugfs imc_* file is created per node, and is created
by the first nest unit which is registered. For the subsequent nest
units, debugfs_create_dir() will just return since the imc_* file
already exists.

The commit c33d442328f55 ("debugfs: make error message a bit more
verbose") prints a message if the debugfs file already exists in
debugfs_create_dir(). That is why we are encountering these messages
now.

This patch (i.e. "powerpc/imc: Dont create debugfs files for cpu-less
nodes") will address the initial issue, i.e. the numa crash while
reading imc_* debugfs files for cpu-less nodes, and will not address
these debugfs messages.

But yeah, this is a good catch. We can have some checks to avoid these
debugfs messages.



Hi Michael,

Do we need a separate patch to address these debugfs messages, or can we
address them in the next version of this patch itself?


Thanks,

Anju






[PATCH v2 3/3] tools/perf: Set 'trace_cycles' as default event for perf kvm record in powerpc

2019-07-18 Thread Anju T Sudhakar
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index c55e7405940e..0a06626fb18a 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 /*
 * In case of the powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv)
 
parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
if (!event) {
-   free(tmp);
-   return -EINVAL;
+   if (pmu_have_event("trace_imc", "trace_cycles")) {
+   argv[j++] = strdup("-e");
+   argv[j++] = strdup("trace_imc/trace_cycles/");
+   *argc += 2;
+   } else {
+   free(tmp);
+   return -EINVAL;
+   }
}
 
free(tmp);
-- 
2.20.1



[PATCH v2 2/3] tools/perf: Add arch neutral function to choose event for perf kvm record

2019-07-18 Thread Anju T Sudhakar
'perf kvm record' uses 'cycles' (if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest in case of the
powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch-specific event for 'perf kvm record',
instead of selecting 'cycles' as the default event for all architectures.

For powerpc this function checks for any user-specified event, and if there
isn't any it returns invalid instead of proceeding with the 'cycles' event.

Signed-off-by: Anju T Sudhakar 
---

Changes from v1 -> v2:
* Cross-build issue for aarch64, reported by Ravi, is fixed.
---

 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +
 tools/perf/builtin-kvm.c| 12 +++-
 tools/perf/util/kvm-stat.h  |  1 +
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..c55e7405940e 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include <subcmd/parse-options.h>
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 
return ret;
 }
+
+/*
+ * In case of the powerpc architecture, the pmu registers are programmable
+ * by the guest kernel, so monitoring the guest via the host may not provide
+ * valid samples. It is better to fail "perf kvm record" with the default
+ * "cycles" event than to silently mis-profile the guest on powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+   const char **tmp;
+   bool event = false;
+   int i, j = *argc;
+
+   const struct option event_options[] = {
+   OPT_BOOLEAN('e', "event", &event, NULL),
+   OPT_END()
+   };
+
+   tmp = calloc(j + 1, sizeof(char *));
+   if (!tmp)
+   return -EINVAL;
+
+   for (i = 0; i < j; i++)
+   tmp[i] = argv[i];
+
+   parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
+   if (!event) {
+   free(tmp);
+   return -EINVAL;
+   }
+
+   free(tmp);
+   return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 5d2b34d290a3..d03750da051b 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+   const char **argv __maybe_unused)
+{
+   return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-   int rec_argc, i = 0, j;
+   int rec_argc, i = 0, j, ret;
const char **rec_argv;
 
+   ret = kvm_add_default_arch_event(&argc, argv);
+   if (ret)
+   return -EINVAL;
+
rec_argc = argc + 2;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index b3b2670e1a2b..81a5bf4fbc71 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -148,4 +148,5 @@ extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.20.1



[PATCH v2 1/3] tools/perf: Move kvm-stat header file from conditional inclusion to common include section

2019-07-18 Thread Anju T Sudhakar
Move the kvm-stat header file to the common include section, and place the
definitions in the header file under the conditional inclusion
`#ifdef HAVE_KVM_STAT_SUPPORT`.

This helps to define other perf kvm related function prototypes in
kvm-stat header file, which may not need kvm-stat support.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/builtin-kvm.c   | 2 +-
 tools/perf/util/kvm-stat.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index b33c83489120..5d2b34d290a3 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -19,6 +19,7 @@
 #include "util/top.h"
 #include "util/data.h"
 #include "util/ordered-events.h"
+#include "util/kvm-stat.h"
 
 #include 
 #ifdef HAVE_TIMERFD_SUPPORT
@@ -55,7 +56,6 @@ static const char *get_filename_for_perf_kvm(void)
 }
 
 #ifdef HAVE_KVM_STAT_SUPPORT
-#include "util/kvm-stat.h"
 
 void exit_event_get_key(struct perf_evsel *evsel,
struct perf_sample *sample,
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..b3b2670e1a2b 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -2,6 +2,8 @@
 #ifndef __PERF_KVM_STAT_H
 #define __PERF_KVM_STAT_H
 
+#ifdef HAVE_KVM_STAT_SUPPORT
+
 #include "../perf.h"
 #include "tool.h"
 #include "stat.h"
@@ -144,5 +146,6 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
+#endif /* HAVE_KVM_STAT_SUPPORT */
 
 #endif /* __PERF_KVM_STAT_H */
-- 
2.20.1



Re: power9 NUMA crash while reading debugfs imc_cmd

2019-06-28 Thread Anju T Sudhakar



On 6/28/19 9:04 AM, Qian Cai wrote:



On Jun 27, 2019, at 11:12 PM, Michael Ellerman  wrote:

Qian Cai  writes:

Read of debugfs imc_cmd file for a memory-less node will trigger a crash below
on this power9 machine which has the following NUMA layout.

What type of machine is it?

description: PowerNV
product: 8335-GTH (ibm,witherspoon)
vendor: IBM
width: 64 bits
capabilities: smp powernv opal



Hi Qian Cai,

Could you please try with this patch:
https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-June/192803.html
and see if the issue is resolved?

Thanks,
Anju




[PATCH] powerpc/imc: Dont create debugfs files for cpu-less nodes

2019-06-28 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Commit 684d984038aa ("powerpc/powernv: Add debugfs interface for imc-mode
and imc") added a debugfs interface for the nest imc pmu devices to support
changing of different ucode modes, primarily adding this capability for
debug. But when doing so, the code did not consider the case of cpu-less
nodes, so reading the _cmd_ or _mode_ file of a cpu-less node
will create this crash.

[ 1139.415461][ T5301] Faulting instruction address: 0xc00d0d58
[ 1139.415492][ T5301] Oops: Kernel access of bad area, sig: 11 [#1]
[ 1139.415509][ T5301] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=256
DEBUG_PAGEALLOC NUMA PowerNV
[ 1139.415542][ T5301] Modules linked in: i2c_opal i2c_core ip_tables x_tables
xfs sd_mod bnx2x mdio ahci libahci tg3 libphy libata firmware_class dm_mirror
dm_region_hash dm_log dm_mod
[ 1139.415595][ T5301] CPU: 67 PID: 5301 Comm: cat Not tainted 5.2.0-rc6-next-
20190627+ #19
[ 1139.415634][ T5301] NIP:  c00d0d58 LR: c049aa18 
CTR:c00d0d50
[ 1139.415675][ T5301] REGS: c00020194548f9e0 TRAP: 0300   Not tainted  
(5.2.0-rc6-next-20190627+)
[ 1139.415705][ T5301] MSR:  90009033   
CR:28022822  XER: 
[ 1139.415777][ T5301] CFAR: c049aa14 DAR: 0003fc08 
DSISR:4000 IRQMASK: 0
[ 1139.415777][ T5301] GPR00: c049aa18 c00020194548fc70 
c16f8b03fc08
[ 1139.415777][ T5301] GPR04: c00020194548fcd0  
14884e7300011eaa
[ 1139.415777][ T5301] GPR08: 7eea5a52 c00d0d50 

[ 1139.415777][ T5301] GPR12: c00d0d50 c000201fff7f8c00 

[ 1139.415777][ T5301] GPR16: 000d 7fffeb0c3368 

[ 1139.415777][ T5301] GPR20:   
0002
[ 1139.415777][ T5301] GPR24:   
000200010ec9
[ 1139.415777][ T5301] GPR28: c00020194548fdf0 c00020049a584ef8 
c00020049a584ea8
[ 1139.416116][ T5301] NIP [c00d0d58] imc_mem_get+0x8/0x20
[ 1139.416143][ T5301] LR [c049aa18] simple_attr_read+0x118/0x170
[ 1139.416158][ T5301] Call Trace:
[ 1139.416182][ T5301] [c00020194548fc70] 
[c049a970]simple_attr_read+0x70/0x170 (unreliable)
[ 1139.416255][ T5301] [c00020194548fd10] 
[c054385c]debugfs_attr_read+0x6c/0xb0
[ 1139.416305][ T5301] [c00020194548fd60] [c0454c1c]__vfs_read+0x3c/0x70
[ 1139.416363][ T5301] [c00020194548fd80] [c0454d0c] vfs_read+0xbc/0x1a0
[ 1139.416392][ T5301] [c00020194548fdd0] [c045519c]ksys_read+0x7c/0x140
[ 1139.416434][ T5301] [c00020194548fe20] 
[c000b108]system_call+0x5c/0x70
[ 1139.416473][ T5301] Instruction dump:
[ 1139.416511][ T5301] 4e800020 6000 7c0802a6 6000 7c801d28 3860 
4e800020 6000
[ 1139.416572][ T5301] 6000 6000 7c0802a6 6000 <7d201c28> 3860 
f924 4e800020
[ 1139.416636][ T5301] ---[ end trace c44d1fb4ace04784 ]---
[ 1139.520686][ T5301]
[ 1140.520820][ T5301] Kernel panic - not syncing: Fatal exception

Patch adds a check to avoid creation of these files to cpu-less nodes.

Fixes: 684d984038aa ('powerpc/powernv: Add debugfs interface for imc-mode and 
imc')
Reported-by: Qian Cai 
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..12c8964a2f9c 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include <linux/cpumask.h>
 #include 
 #include 
 #include 
@@ -56,6 +57,7 @@ static void export_imc_mode_and_cmd(struct device_node *node,
int chip = 0, nid;
char mode[16], cmd[16];
u32 cb_offset;
+   const struct cpumask *l_cpumask;
 
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
@@ -70,6 +72,14 @@ static void export_imc_mode_and_cmd(struct device_node *node,
cb_offset = IMC_CNTL_BLK_OFFSET;
 
for_each_node(nid) {
+   /*
+* Since these are related to nest pmu,
+* create only if the node has any cpu in it.
+*/
+   l_cpumask = cpumask_of_node(nid);
+   if (cpumask_empty(l_cpumask))
+   continue;
+
loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
sprintf(mode, "imc_mode_%d", nid);
-- 
2.20.1



Re: [PATCH v2] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.

2019-06-11 Thread Anju T Sudhakar

Hi Leonardo,

On 6/11/19 12:17 AM, Leonardo Bras wrote:

On Mon, 2019-06-10 at 12:02 +0530, Anju T Sudhakar wrote:

Nest and core imc (In-memory Collection counters) assign a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip (for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core (for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core are offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Seems like a very interesting work.
Out of curiosity, have you used 'chcpu -d' to create your benchmark?


Here I did not use chcpu to disable the cpu.

I used a script which will offline cpus 88-175 by echoing `0` to
/sys/devices/system/cpu/cpu*/online.

Regards,
Anju




[PATCH RESEND 2/2] tools/perf: Set 'trace_cycles' as default event for perf kvm record in powerpc

2019-06-10 Thread Anju T Sudhakar
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index 66f8fe500945..b552884263df 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 /*
 * In case of the powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv)
 
parse_options(j, tmp, event_options, NULL, 0);
if (!event) {
-   free(tmp);
-   return -EINVAL;
+   if (pmu_have_event("trace_imc", "trace_cycles")) {
+   argv[j++] = strdup("-e");
+   argv[j++] = strdup("trace_imc/trace_cycles/");
+   *argc += 2;
+   } else {
+   free(tmp);
+   return -EINVAL;
+   }
}
 
free(tmp);
-- 
2.17.2



[PATCH RESEND 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record

2019-06-10 Thread Anju T Sudhakar
'perf kvm record' uses 'cycles' (if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest in case of the
powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch-specific event for 'perf kvm record',
instead of selecting 'cycles' as the default event for all architectures.

For powerpc this function checks for any user-specified event, and if there
isn't any it returns invalid instead of proceeding with the 'cycles' event.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +
 tools/perf/builtin-kvm.c| 12 +++-
 tools/perf/util/kvm-stat.h  |  2 +-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..66f8fe500945 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include <subcmd/parse-options.h>
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 
return ret;
 }
+
+/*
+ * In case of the powerpc architecture, the pmu registers are programmable
+ * by the guest kernel, so monitoring the guest via the host may not provide
+ * valid samples. It is better to fail "perf kvm record" with the default
+ * "cycles" event than to silently mis-profile the guest on powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+   const char **tmp;
+   bool event = false;
+   int i, j = *argc;
+
+   const struct option event_options[] = {
+   OPT_BOOLEAN('e', "event", &event, NULL),
+   OPT_END()
+   };
+
+   tmp = calloc(j + 1, sizeof(char *));
+   if (!tmp)
+   return -EINVAL;
+
+   for (i = 0; i < j; i++)
+   tmp[i] = argv[i];
+
+   parse_options(j, tmp, event_options, NULL, 0);
+   if (!event) {
+   free(tmp);
+   return -EINVAL;
+   }
+
+   free(tmp);
+   return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index dbb6f737a3e2..fe33b3ec55c9 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+   const char **argv __maybe_unused)
+{
+   return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-   int rec_argc, i = 0, j;
+   int rec_argc, i = 0, j, ret;
const char **rec_argv;
 
+   ret = kvm_add_default_arch_event(&argc, argv);
+   if (ret)
+   return -EINVAL;
+
rec_argc = argc + 2;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..da38b56c46cb 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -144,5 +144,5 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
-
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.17.2



[PATCH v2] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.

2019-06-10 Thread Anju T Sudhakar
Nest and core imc (In-memory Collection counters) assign a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip (for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core (for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core are offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Example:
In a system which has 2 sockets, with
NUMA node0 CPU(s): 0-87
NUMA node8 CPU(s): 88-175

Time taken to offline cpu 88-175:
real2m56.099s
user0m0.191s
sys 0m0.000s

Use cpumask_last() to choose the target cpu, when the designated cpu
goes offline, so the migration will happen only when the last_cpu in the
mask goes offline. This way the time taken to offline all cpus in a
chip/core can be reduced.

With the patch, 

Time taken to offline cpu 88-175:
real0m12.207s
user0m0.171s
sys 0m0.000s


Offlining all cpus in reverse order is also taken care of, because
cpumask_any_but() is used to find the designated cpu if the last cpu in
the mask goes offline. Since cpumask_any_but() always returns the first
cpu in the mask, that becomes the designated cpu and migration will happen
only when the first_cpu in the mask goes offline.

Example:
With the patch,

Time taken to offline cpu from 175-88:
real0m9.330s
user0m0.110s
sys 0m0.000s

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---

Changes from v1:
Modified the commit log with more info.
---

 arch/powerpc/perf/imc-pmu.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753..fbfd6e7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 */
nid = cpu_to_node(cpu);
l_cpumask = cpumask_of_node(nid);
-   target = cpumask_any_but(l_cpumask, cpu);
+   target = cpumask_last(l_cpumask);
+
+   /*
+* If this (target) is the last cpu in the cpumask for this chip,
+* check for any possible online cpu in the chip.
+*/
+   if (unlikely(target == cpu))
+   target = cpumask_any_but(l_cpumask, cpu);
 
/*
 * Update the cpumask with the target cpu and
@@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return 0;
 
/* Find any online cpu in that core except the current "cpu" */
-   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
+   ncpu = cpumask_last(cpu_sibling_mask(cpu));
+
+   if (unlikely(ncpu == cpu))
+   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
 
if (ncpu >= 0 && ncpu < nr_cpu_ids) {
cpumask_set_cpu(ncpu, &core_imc_cpumask);
-- 
1.8.3.1



[PATCH 2/2] tools/perf: Set 'trace_cycles' as default event for perf kvm record in powerpc

2019-05-24 Thread Anju T Sudhakar
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index 66f8fe500945..b552884263df 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 /*
 * In case of the powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv)
 
parse_options(j, tmp, event_options, NULL, 0);
if (!event) {
-   free(tmp);
-   return -EINVAL;
+   if (pmu_have_event("trace_imc", "trace_cycles")) {
+   argv[j++] = strdup("-e");
+   argv[j++] = strdup("trace_imc/trace_cycles/");
+   *argc += 2;
+   } else {
+   free(tmp);
+   return -EINVAL;
+   }
}
 
free(tmp);
-- 
2.17.2



[PATCH 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record

2019-05-24 Thread Anju T Sudhakar
'perf kvm record' uses 'cycles' (if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest in case of the
powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch-specific event for 'perf kvm record',
instead of selecting 'cycles' as the default event for all architectures.

For powerpc this function checks for any user-specified event, and if there
isn't any it returns invalid instead of proceeding with the 'cycles' event.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +
 tools/perf/builtin-kvm.c| 12 +++-
 tools/perf/util/kvm-stat.h  |  2 +-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..66f8fe500945 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include <subcmd/parse-options.h>
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 
return ret;
 }
+
+/*
+ * In case of the powerpc architecture, the pmu registers are programmable
+ * by the guest kernel, so monitoring the guest via the host may not provide
+ * valid samples. It is better to fail "perf kvm record" with the default
+ * "cycles" event than to silently mis-profile the guest on powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+   const char **tmp;
+   bool event = false;
+   int i, j = *argc;
+
+   const struct option event_options[] = {
+   OPT_BOOLEAN('e', "event", &event, NULL),
+   OPT_END()
+   };
+
+   tmp = calloc(j + 1, sizeof(char *));
+   if (!tmp)
+   return -EINVAL;
+
+   for (i = 0; i < j; i++)
+   tmp[i] = argv[i];
+
+   parse_options(j, tmp, event_options, NULL, 0);
+   if (!event) {
+   free(tmp);
+   return -EINVAL;
+   }
+
+   free(tmp);
+   return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index dbb6f737a3e2..fe33b3ec55c9 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+   const char **argv __maybe_unused)
+{
+   return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-   int rec_argc, i = 0, j;
+   int rec_argc, i = 0, j, ret;
const char **rec_argv;
 
+   ret = kvm_add_default_arch_event(&argc, argv);
+   if (ret)
+   return -EINVAL;
+
rec_argc = argc + 2;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..da38b56c46cb 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -144,5 +144,5 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
-
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.17.2



Re: [PATCH] powerpc/powernv: Return for invalid IMC domain

2019-05-22 Thread Anju T Sudhakar

Hi,

On 5/21/19 5:18 PM, Michael Ellerman wrote:

Anju T Sudhakar  writes:

Currently init_imc_pmu() can fail either because
an IMC unit with an invalid domain (i.e. an IMC node not
supported by the kernel) is attempted for pmu registration,
or because something went wrong while registering a valid IMC unit.
In both cases the kernel provides a 'Registration failed'
error message.

Example:
Log message, when trace-imc node is not supported by the kernel, and the
skiboot supports trace-imc node.

So for kernel, trace-imc node is now an unknown domain.

[1.731870] nest_phb5_imc performance monitor hardware support registered
[1.731944] nest_powerbus0_imc performance monitor hardware support 
registered
[1.734458] thread_imc performance monitor hardware support registered
[1.734460] IMC Unknown Device type
[1.734462] IMC PMU (null) Register failed
[1.734558] nest_xlink0_imc performance monitor hardware support registered
[1.734614] nest_xlink1_imc performance monitor hardware support registered
[1.734670] nest_xlink2_imc performance monitor hardware support registered
[1.747043] Initialise system trusted keyrings
[1.747054] Key type blacklist registered


To avoid ambiguity on the error message, return for invalid domain
before attempting a pmu registration.

What do we print once the patch is applied?



Once the patch is applied, we return for invalid domains, so we will
only have the `/IMC Unknown Device type/` message printed for *unknown
domains*. The `/IMC PMU (null) Register failed/` message will appear
only if the registration fails for a *known domain*.

Thanks,
Anju
Anju




Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units

2019-05-20 Thread Anju T Sudhakar

Hi,

Somehow the subject of this patch didn't appear completely here.
The subject of this patch is as follows:

`Subject: [PATCH] powerpc/perf: Use cpumask_last() to determine the
designated cpu for nest/core units.`

Thanks,
Anju


On 5/20/19 2:35 PM, Anju T Sudhakar wrote:

Nest and core imc (In-memory Collection counters) assign a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip (for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core (for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core are offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.





[PATCH] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units

2019-05-20 Thread Anju T Sudhakar
Nest and core imc (In-memory Collection counters) assign a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip (for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core (for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core are offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Example:
In a system which has 2 sockets, with
NUMA node0 CPU(s): 0-87
NUMA node8 CPU(s): 88-175

Time taken to offline cpu 88-175:
real2m56.099s
user0m0.191s
sys 0m0.000s

Use cpumask_last() to choose the target cpu, when the designated cpu
goes offline, so the migration will happen only when the last_cpu in the
mask goes offline. This way the time taken to offline all cpus in a
chip/core can be reduced.

With the patch, 

Time taken to offline cpu 88-175:
real0m12.207s
user0m0.171s
sys 0m0.000s

cpumask_last() is a better way to find the target cpu, since in most of the
cases cpu hotplug is performed in increasing order (even in ppc64_cpu).

cpumask_any_but() can still be used to check the possibility of other
online cpus from the same chip/core if the last cpu in the mask goes
offline.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753..fbfd6e7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 */
nid = cpu_to_node(cpu);
l_cpumask = cpumask_of_node(nid);
-   target = cpumask_any_but(l_cpumask, cpu);
+   target = cpumask_last(l_cpumask);
+
+   /*
+* If this (target) is the last cpu in the cpumask for this chip,
+* check for any possible online cpu in the chip.
+*/
+   if (unlikely(target == cpu))
+   target = cpumask_any_but(l_cpumask, cpu);
 
/*
 * Update the cpumask with the target cpu and
@@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return 0;
 
/* Find any online cpu in that core except the current "cpu" */
-   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
+   ncpu = cpumask_last(cpu_sibling_mask(cpu));
+
+   if (unlikely(ncpu == cpu))
+   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
 
if (ncpu >= 0 && ncpu < nr_cpu_ids) {
cpumask_set_cpu(ncpu, &core_imc_cpumask);
-- 
1.8.3.1



[PATCH] powerpc/powernv: Return for invalid IMC domain

2019-05-20 Thread Anju T Sudhakar
Currently init_imc_pmu() can fail either because
an IMC unit with an invalid domain (i.e. an IMC node not
supported by the kernel) is attempted for pmu registration,
or because something went wrong while registering a valid IMC unit.
In both cases the kernel provides a 'Registration failed'
error message.

Example:
Log message, when trace-imc node is not supported by the kernel, and the
skiboot supports trace-imc node.

So for kernel, trace-imc node is now an unknown domain.

[1.731870] nest_phb5_imc performance monitor hardware support registered
[1.731944] nest_powerbus0_imc performance monitor hardware support 
registered
[1.734458] thread_imc performance monitor hardware support registered
[1.734460] IMC Unknown Device type
[1.734462] IMC PMU (null) Register failed
[1.734558] nest_xlink0_imc performance monitor hardware support registered
[1.734614] nest_xlink1_imc performance monitor hardware support registered
[1.734670] nest_xlink2_imc performance monitor hardware support registered
[1.747043] Initialise system trusted keyrings
[1.747054] Key type blacklist registered


To avoid ambiguity in the error message, return for an invalid domain
before attempting pmu registration.

Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`)
Reported-by: Pavaman Subramaniyam 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 58a0794..4e8b0e1 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
struct imc_pmu *pmu_ptr;
u32 offset;
 
+   /* Return for unknown domain */
+   if (domain < 0)
+   return -EINVAL;
+
/* memory for pmu */
pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
if (!pmu_ptr)
-- 
1.8.3.1



[PATCH] powerpc/imc: Add documentation for IMC and trace-mode

2019-05-10 Thread Anju T Sudhakar
Documentation for IMC(In-Memory Collection Counters) infrastructure
and trace-mode of IMC.

Signed-off-by: Anju T Sudhakar 
---
 Documentation/powerpc/imc.txt | 195 ++
 1 file changed, 195 insertions(+)
 create mode 100644 Documentation/powerpc/imc.txt

diff --git a/Documentation/powerpc/imc.txt b/Documentation/powerpc/imc.txt
new file mode 100644
index ..9c32e059f3be
--- /dev/null
+++ b/Documentation/powerpc/imc.txt
@@ -0,0 +1,195 @@
+   ===
+   IMC (In-Memory Collection Counters)
+   ===
+   Date created: 10 May 2019
+
+Table of Contents:
+--
+   - Basic overview
+   - IMC example Usage
+   - IMC Trace Mode
+   - LDBAR Register Layout
+   - TRACE_IMC_SCOM bit representation
+   - Trace IMC example usage
+   - Benefits of using IMC trace-mode
+
+
+Basic overview
+==
+
+IMC (In-Memory collection counters) is a hardware monitoring facility
+that collects a large number of hardware performance events at Nest level
+(these are on-chip but off-core), Core level and Thread level.
+
+The Nest PMU counters are handled by a Nest IMC microcode which runs
+in the OCC (On-Chip Controller) complex. The microcode collects the
+counter data and moves the nest IMC counter data to memory.
+
+The Core and Thread IMC PMU counters are handled in the core. Core
+level PMU counters give us the IMC counters' data per core and thread
+level PMU counters give us the IMC counters' data per CPU thread.
+
+OPAL obtains the IMC PMU and supported events information from the
+IMC Catalog and passes it on to the kernel via the device tree. The
+event information contains:
+ - Event name
+ - Event Offset
+ - Event description
+and, optionally:
+ - Event scale
+ - Event unit
+
+Some PMUs may have a common scale and unit values for all their
+supported events. For those cases, the scale and unit properties for
+those events must be inherited from the PMU.
+
+The event offset in the memory is where the counter data gets
+accumulated.
+
+IMC catalog is available at:
+   https://github.com/open-power/ima-catalog
+
+The kernel discovers the IMC counters information in the device tree
+at the "imc-counters" device node which has a compatible field
+"ibm,opal-in-memory-counters". From the device tree, the kernel parses
+the PMUs and their event information and registers the PMU and its
+attributes in the kernel.
+
+IMC example usage
+=
+
+# perf list
+
+  [...]
+  nest_mcs01/PM_MCS01_64B_RD_DISP_PORT01/ [Kernel PMU event]
+  nest_mcs01/PM_MCS01_64B_RD_DISP_PORT23/ [Kernel PMU event]
+
+  [...]
+  core_imc/CPM_0THRD_NON_IDLE_PCYC/  [Kernel PMU event]
+  core_imc/CPM_1THRD_NON_IDLE_INST/  [Kernel PMU event]
+
+  [...]
+  thread_imc/CPM_0THRD_NON_IDLE_PCYC/ [Kernel PMU event]
+  thread_imc/CPM_1THRD_NON_IDLE_INST/ [Kernel PMU event]
+
+To see per chip data for nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/ :
+ # ./perf stat -e "nest_mcs01/PM_MCS01_64B_WR_DISP_PORT01/" -a --per-socket
+
+To see non-idle instructions for core 0 :
+ # ./perf stat -e "core_imc/CPM_NON_IDLE_INST/" -C 0 -I 1000
+
+To see non-idle instructions for a "make" :
+ # ./perf stat -e "thread_imc/CPM_NON_IDLE_PCYC/" make
+
+
+IMC Trace-mode
+===
+
+POWER9 supports two modes for IMC, the Accumulation mode and the
+Trace mode. In Accumulation mode, event counts are accumulated in system
+memory. The hypervisor then reads the posted counts periodically or when
+requested. In IMC Trace mode, the 64 bit trace scom value is initialized
+with the event information. CPMC*SEL and CPMC_LOAD in the trace scom
+specify the event to be monitored and the sampling duration. On each
+overflow of CPMC*SEL, hardware snapshots the program counter along
+with event counts and writes into memory pointed to by LDBAR.
+
+LDBAR is a 64 bit special purpose per-thread register; it has bits to
+indicate whether hardware is configured for accumulation or trace mode.
+
+* LDBAR Register Layout:
+   0 : Enable/Disable
+   1 : 0 -> Accumulation Mode
+   1 -> Trace Mode
+   2:3   : Reserved
+   4-6   : PB scope
+   7 : Reserved
+   8:50  : Counter Address
+   51:63 : Reserved
+
+* TRACE_IMC_SCOM bit representation:
+
+   0:1 : SAMPSEL
+   2:33: CPMC_LOAD
+   34:40   : CPMC1SEL
+   41:47   : CPMC2SEL
+   48:50   : BUFFERSIZE
+   51:63   : RESERVED
+
+CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determine
+the event to count. BUFFERSIZE indicates the memory range. On each overflow,
+hardware snapshots the program counter along with event counts, updates the
+memory, and reloads the CPMC_LOAD value for the next sampling duration.

Re: [PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-16 Thread Anju T Sudhakar



On 4/16/19 3:14 PM, Anju T Sudhakar wrote:

Hi,

Kindly ignore this series, since patch 5/5 in this series doesn't
incorporate the event-format change that I've done in v4 of this series.

Apologies for the inconvenience. I will post the updated v5 soon.

s/v5/v4

Thanks,
Anju
Anju

On 4/15/19 3:41 PM, Anju T Sudhakar wrote:

IMC (In-Memory collection counters) is a hardware monitoring facility
that collects a large number of hardware performance events.
POWER9 supports two modes for IMC, the Accumulation mode and the
Trace mode. In Accumulation mode, event counts are accumulated in system
memory. The hypervisor then reads the posted counts periodically or when
requested. In IMC Trace mode, the 64 bit trace scom value is initialized
with the event information. CPMC*SEL and CPMC_LOAD in the trace scom
specify the event to be monitored and the sampling duration. On each
overflow of CPMC*SEL, hardware snapshots the program counter along with
event counts and writes into memory pointed to by LDBAR. LDBAR has bits
to indicate whether hardware is configured for accumulation or trace mode.
Currently the event monitored for trace-mode is fixed as cycles.

Trace-IMC Implementation:
-------------------------
To enable trace-imc, we need to

* Add a trace node in the DTS file for power9, so that the new trace node
can be discovered by the kernel.

Information included in the DTS file is as follows (a snippet from
the ima-catalog):
the ima-catalog)

TRACE_IMC: trace-events {
	#address-cells = <0x1>;
	#size-cells = <0x1>;
	event@1020 {
		event-name = "cycles" ;
		reg = <0x1020 0x8>;
		desc = "Reference cycles" ;
	};
};
trace@0 {
	compatible = "ibm,imc-counters";
	events-prefix = "trace_";
	reg = <0x0 0x8>;
	events = < &TRACE_IMC >;
	type = <0x2>;
	size = <0x4>;
};

OP-BUILD changes needed to include the "trace node" are already pulled in
to the ima-catalog repo:
https://github.com/open-power/op-build/commit/d3e75dc26d1283d7d5eb444bff1ec9e40d5dfc07



* Enhance the opal_imc_counters_* calls to support this new trace mode
in imc. Add support to initialize the trace-mode scom.

TRACE_IMC_SCOM bit representation:

0:1 : SAMPSEL
2:33    : CPMC_LOAD
34:40   : CPMC1SEL
41:47   : CPMC2SEL
48:50   : BUFFERSIZE
51:63   : RESERVED

CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determine
the event to count. BUFFERSIZE indicates the memory range. On each overflow,
hardware snapshots the program counter along with event counts, updates the
memory, and reloads the CPMC_LOAD value for the next sampling duration.
IMC hardware does not support exceptions, so it quietly wraps around if
the memory buffer reaches the end.

OPAL support for IMC trace mode is already upstream.

* Set LDBAR spr to enable imc-trace mode.
   LDBAR Layout:
   0 : Enable/Disable
   1 : 0 -> Accumulation Mode
   1 -> Trace Mode
   2:3   : Reserved
   4-6   : PB scope
   7 : Reserved
   8:50  : Counter Address
   51:63 : Reserved

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates the memory. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupts) processing overhead.
Performance data using 'perf top' with and without the trace-imc event:
Performance data using 'perf top' with and without trace-imc event:

PMI interrupt counts when the `perf top` command is executed without the
trace-imc event:


# cat /proc/interrupts  (a snippet from the output)
9944  1072    804    804   1644    804 1306
804    804    804    804    804 804    804
804    804   1961   1602    804    804 1258
[-]
803    803    803    803    803 803    803
803    803    803    803    804 804    804
804    804    804    804    804 804    803
803    803    803    803    803 1306    803
803   Performance monitoring interrupts


`perf top` with trace-imc (executed right after 'perf top' without the
trace-imc event):


# perf top -e trace_imc/trace_cycles/
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt
11.22%  [kernel]  [k] rcu_idle_enter
10.25%  [kernel]  [k] find_next_bit
  7.91%  [kernel]  [k] do_idle
  7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
  5.20%  [kernel]  [k] tick_nohz_idle_stop_tick
  [---]

# cat /proc/interrupts (a snippet from the output)

9944  1072    804    804   1644    804 1306
804    804    804    804    804 804    804
804    804   1961   1602    804    804 1258
[-]
8

[PATCH v4 5/5] powerpc/perf: Trace imc PMU functions

2019-04-16 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 205 +++-
 1 file changed, 204 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3fe0222885bc..cc9724561bf2 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -53,7 +53,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
return container_of(event->pmu, struct imc_pmu, pmu);
 }
 
-PMU_FORMAT_ATTR(event, "config:0-40");
+PMU_FORMAT_ATTR(event, "config:0-61");
 PMU_FORMAT_ATTR(offset, "config:0-31");
 PMU_FORMAT_ATTR(rvalue, "config:32");
 PMU_FORMAT_ATTR(mode, "config:33-40");
@@ -70,6 +70,25 @@ static struct attribute_group imc_format_group = {
.attrs = imc_format_attrs,
 };
 
+/* Format attribute for imc trace-mode */
+PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
+PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
+PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
+PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
+static struct attribute *trace_imc_format_attrs[] = {
+   _attr_event.attr,
+   _attr_cpmc_reserved.attr,
+   _attr_cpmc_event.attr,
+   _attr_cpmc_samplesel.attr,
+   _attr_cpmc_load.attr,
+   NULL,
+};
+
+static struct attribute_group trace_imc_format_group = {
+	.name = "format",
+	.attrs = trace_imc_format_attrs,
+};
+
 /* Get the cpumask printed to a buffer "buf" */
 static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
struct device_attribute *attr,
@@ -1108,6 +1127,182 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref = NULL;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
+
+   if (core_imc_refc)
+   ref = &core_imc_refc[core_id];
+   if (!ref) {
+   /* If core-imc is not enabled, use trace-imc reference count */
+   if (trace_imc_refc)
+ 

[PATCH v4 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2019-04-16 Thread Anju T Sudhakar
This patch detects trace-imc events, does memory initializations for each
online cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c   | 104 ++
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 3 files changed, 108 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 7fe258e17dfe..3fe0222885bc 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static struct imc_pmu_ref *trace_imc_refc;
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1050,6 +1055,59 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+   int core_id = (cpu_id / threads_per_core);
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   /* Init the mutex, if not already */
+   trace_imc_refc[core_id].id = core_id;
+   mutex_init(&trace_imc_refc[core_id].lock);
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1172,6 +1230,18 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+   kfree(trace_imc_refc);
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1213,6 +1283,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1295,6 +1370,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
+   trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
+   GFP_KERNEL);
+   if (!trace_imc_refc)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (re

[PATCH v4 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2019-04-16 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e. bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index b1c37cc3fa98..51f1d3eaaa6d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -788,8 +788,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -807,7 +810,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -824,9 +827,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -977,6 +978,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -985,6 +987,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1016,11 +1021,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1039,6 +1040,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.2



[PATCH v4 3/5] powerpc/perf: Add privileged access check for thread_imc

2019-04-16 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to the thread_imc pmu, since
some events report privilege level information.
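
With this check, opening a thread_imc event without CAP_SYS_ADMIN fails
with EACCES. A hypothetical userspace probe illustrating the effect (the
PMU type value below is a stand-in; the real one is read from
/sys/bus/event_source/devices/thread_imc/type):

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    int main(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = 8;          /* stand-in for thread_imc's sysfs "type" */
            attr.config = 0;        /* counter offset within the page */

            /* pid = 0, cpu = -1: monitor this thread on any cpu */
            long fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0)
                    printf("perf_event_open: %s\n", strerror(errno));
            return 0;
    }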

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 51f1d3eaaa6d..7fe258e17dfe 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -859,6 +859,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.2



[PATCH v4 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2019-04-16 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.2



[PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-16 Thread Anju T Sudhakar
   
  0     : Enable/Disable
  1     : 0 -> Accumulation Mode
          1 -> Trace Mode
  2:3   : Reserved
  4-6   : PB scope
  7     : Reserved
  8:50  : Counter Address
  51:63 : Reserved

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates memory directly. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupt) processing overhead.


Performance data using 'perf top' with and without trace-imc event:

PMI interrupts count when `perf top` command is executed without trace-imc 
event.

# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (executed right after 'perf top' without trace-imc 
event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.


Changelog:
--
From v3 -> v4:

* trace_imc_refc is introduced, so that trace-imc can be used even if
core-imc is disabled (see the sketch after this list).

* trace_imc_pmu_sched_task is removed, and opal start/stop is invoked
in the trace_imc_event_add/del functions.
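
A sketch of the resulting reference-count selection, simplified from
trace_imc_event_add() in patch 5/5:

    /*
     * Prefer the core-imc reference count when core-imc is present;
     * otherwise fall back to trace-imc's own count, so trace mode
     * keeps working even with core-imc disabled.
     */
    struct imc_pmu_ref *ref = NULL;

    if (core_imc_refc)
            ref = &core_imc_refc[core_id];
    if (!ref && trace_imc_refc)
            ref = &trace_imc_refc[core_id];
    if (!ref)
            return -EINVAL;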

 
Suggestions/comments are welcome.

Anju T Sudhakar (4):   
  powerpc/include: Add data structures and macros for IMC trace mode   
  powerpc/perf: Rearrange setting of ldbar for thread-imc  
  powerpc/perf: Trace imc events detection and cpuhotplug  
  powerpc/perf: Trace imc PMU functions
   
Madhavan Srinivasan (1):   
  powerpc/perf: Add privileged access check for thread_imc 
 

Re: [PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-16 Thread Anju T Sudhakar

Hi,

Kindly ignore this series, since patch 5/5 in this series doesn't
incorporate the event-format change that I've done in v4 of this series.


Apologies for the inconvenience. I will post the updated v5 soon.


Thanks,

Anju

On 4/15/19 3:41 PM, Anju T Sudhakar wrote:

IMC (In-Memory Collection Counters) is a hardware monitoring facility
that collects a large number of hardware performance events.
POWER9 supports two modes for IMC: Accumulation mode and Trace mode.
In Accumulation mode, event counts are accumulated in system memory.
The hypervisor then reads the posted counts periodically, or when
requested. In IMC Trace mode, the 64-bit trace scom value is initialized
with the event information. The CPMC*SEL and CPMC_LOAD fields in the
trace scom specify the event to be monitored and the sampling duration.
On each overflow in the CPMC*SEL, hardware snapshots the program counter
along with event counts and writes them into the memory pointed to by
LDBAR. LDBAR has bits to indicate whether the hardware is configured for
accumulation or trace mode.
Currently the event monitored for trace-mode is fixed as cycles.
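
A minimal sketch of how such a buffer can be walked (assuming the 64-byte
record layout and IMC_TRACE_RECORD_TB1_MASK added in patch 1/5; 'buf',
'nr_records' and 'consume_ip' are illustrative names):

    /*
     * Records are big-endian. tb1 must advance monotonically, and its
     * masked value is repeated in tb2, so stale or torn records can be
     * rejected.
     */
    struct trace_imc_data *rec = buf;
    u64 prev_tb = 0;
    int i;

    for (i = 0; i < nr_records; i++, rec++) {
            u64 tb1 = be64_to_cpu(rec->tb1);

            if (tb1 <= prev_tb)
                    break;          /* stale record: stop scanning */
            if ((tb1 & IMC_TRACE_RECORD_TB1_MASK) != be64_to_cpu(rec->tb2))
                    break;          /* torn record: stop scanning */
            prev_tb = tb1;
            consume_ip(be64_to_cpu(rec->ip));
    }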

Trace-IMC Implementation:
--
To enable trace-imc, we need to

* Add trace node in the DTS file for power9, so that the new trace node can
be discovered by the kernel.

Information included in the DTS file are as follows, (a snippet from
the ima-catalog)

TRACE_IMC: trace-events {
  #address-cells = <0x1>;
  #size-cells = <0x1>;
  event@1020 {
 event-name = "cycles" ;
 reg = <0x1020 0x8>;
 desc = "Reference cycles" ;
  };
  };
  trace@0 {
 compatible = "ibm,imc-counters";
 events-prefix = "trace_";
 reg = <0x0 0x8>;
 events = < &TRACE_IMC >;
 type = <0x2>;
 size = <0x4>;
  };

The OP-BUILD change needed to include the "trace node" is already pulled
in to the ima-catalog repo:

https://github.com/open-power/op-build/commit/d3e75dc26d1283d7d5eb444bff1ec9e40d5dfc07

* Enhance the opal_imc_counters_* calls to support this new trace mode
in imc. Add support to initialize the trace-mode scom.

TRACE_IMC_SCOM bit representation:

0:1 : SAMPSEL
2:33: CPMC_LOAD
34:40   : CPMC1SEL
41:47   : CPMC2SEL
48:50   : BUFFERSIZE
51:63   : RESERVED

CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determine
the event to count. BUFFERSIZE indicates the memory range. On each overflow,
hardware snapshots the program counter along with event counts, updates the
memory, and reloads the CPMC_LOAD value for the next sampling duration.
IMC hardware does not support exceptions, so it quietly wraps around if the
memory buffer reaches the end.
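
Patch 5/5 exposes these fields through the perf event config word. A sketch
of packing a config value from them (bit positions taken from the
PMU_FORMAT_ATTR definitions in that patch; the helper itself is illustrative):

    /*
     * cpmc_reserved = config:0-19, cpmc_event = config:20-27,
     * cpmc_samplesel = config:28-29, cpmc_load = config:30-61.
     */
    static u64 trace_imc_config(u64 cpmc_event, u64 samplesel, u64 load)
    {
            return ((cpmc_event & 0xff) << 20) |
                   ((samplesel & 0x3) << 28) |
                   ((load & 0xffffffffULL) << 30);
    }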

OPAL support for IMC trace mode is already upstream.

* Set LDBAR spr to enable imc-trace mode.

   LDBAR Layout:

   0     : Enable/Disable
   1     : 0 -> Accumulation Mode
           1 -> Trace Mode
   2:3   : Reserved
   4-6   : PB scope
   7     : Reserved
   8:50  : Counter Address
   51:63 : Reserved

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates memory directly. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupt) processing overhead.


Performance data using 'perf top' with and without trace-imc event:

PMI interrupts count when `perf top` command is executed without trace-imc 
event.

# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306
804804804804804804804
804804   1961   1602804804   1258
[-]
803803803803803803803
803803803803804804804
804 

[PATCH v4 5/5] powerpc/perf: Trace imc PMU functions

2019-04-15 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 183 
 1 file changed, 183 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3fe0222885bc..3f433cc96b18 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1108,6 +1108,182 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref = NULL;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
+
+   if (core_imc_refc)
+   ref = &core_imc_refc[core_id];
+   if (!ref) {
+   /* If core-imc is not enabled, use trace-imc reference count */
+   if (trace_imc_refc)
+   ref = &trace_imc_refc[core_id];
+   if (!ref)
+   return -EINVAL;
+   }
+   mtspr(SPRN_LDBAR, ldbar_value);
+   mutex_lock(&ref->lock);
+   if (ref->refc == 0) {
+   if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
+           get_hard_smp_processor_id(smp_processor_id()))) {
+   mutex_unlock(&ref->lock);
+   pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
+   mtspr(SPRN_LDBAR, 0);
+   return -EINVAL;
+   }
+   }
+   ++ref->refc;
+   mutex_unlock(&ref->lock);
+
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   return;
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   u64 local_mem = get_trace_imc_event_base_addr();
+   dump_trace_imc_data(event);
+   memset((void *)local_mem, 0, sizeof(u64));
+}
+
+static void trace_imc_event_start(struct perf_event *event, int flags)
+{
+   return;
+}
+
+static void trace_imc_event_del(struct perf_event *event, int flags)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref = NULL;
+
+   if (core_imc_refc)
+   ref = _imc_r

[PATCH v4 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2019-04-15 Thread Anju T Sudhakar
This patch detects trace-imc events, does memory initializations for each
online cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c   | 104 ++
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 3 files changed, 108 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 7fe258e17dfe..3fe0222885bc 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static struct imc_pmu_ref *trace_imc_refc;
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1050,6 +1055,59 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+   int core_id = (cpu_id / threads_per_core);
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   /* Init the mutex, if not already */
+   trace_imc_refc[core_id].id = core_id;
+   mutex_init(&trace_imc_refc[core_id].lock);
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1172,6 +1230,18 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+   kfree(trace_imc_refc);
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1213,6 +1283,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1295,6 +1370,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
+   trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
+   GFP_KERNEL);
+   if (!trace_imc_refc)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (re

[PATCH v4 3/5] powerpc/perf: Add privileged access check for thread_imc

2019-04-15 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to the thread_imc pmu, since
some events report privilege level information.

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 51f1d3eaaa6d..7fe258e17dfe 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -859,6 +859,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.2



[PATCH v4 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2019-04-15 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e. bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index b1c37cc3fa98..51f1d3eaaa6d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -788,8 +788,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -807,7 +810,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -824,9 +827,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -977,6 +978,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -985,6 +987,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1016,11 +1021,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1039,6 +1040,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.2



[PATCH v4 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2019-04-15 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.2



[PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-15 Thread Anju T Sudhakar
   
  0     : Enable/Disable
  1     : 0 -> Accumulation Mode
          1 -> Trace Mode
  2:3   : Reserved
  4-6   : PB scope
  7     : Reserved
  8:50  : Counter Address
  51:63 : Reserved

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates memory directly. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupt) processing overhead.


Performance data using 'perf top' with and without trace-imc event:

PMI interrupts count when `perf top` command is executed without trace-imc 
event.

# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (executed right after 'perf top' without trace-imc 
event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.


Changelog:
--
From v3 -> v4:

* trace_imc_refc is introduced, so that trace-imc can be used even if
core-imc is disabled.

* trace_imc_pmu_sched_task is removed, and opal start/stop is invoked
in the trace_imc_event_add/del functions.

 
Suggestions/comments are welcome.

Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

Madhavan Srinivasan (1):
  powerpc/perf: Add privileged access check for thread_imc

 arch/powerpc/include/asm/imc-pmu.h|  39 +++
 arch/powerpc/include/asm/opal-api.h   |   1 +
 arch/powerpc/perf/imc-pmu.c   | 318 +-
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 5 files changed, 351 insertions(+), 11 deletions(-)

-- 
2.17.2



[PATCH v3 5/5] powerpc/perf: Trace imc PMU functions

2019-02-06 Thread Anju T Sudhakar
Add PMU functions to support trace-imc and define the format for
trace-imc events.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 197 +++-
 1 file changed, 196 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 1f09265c8fb0..0f1a30f11f6a 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -52,7 +52,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event 
*event)
return container_of(event->pmu, struct imc_pmu, pmu);
 }
 
-PMU_FORMAT_ATTR(event, "config:0-40");
+PMU_FORMAT_ATTR(event, "config:0-61");
 PMU_FORMAT_ATTR(offset, "config:0-31");
 PMU_FORMAT_ATTR(rvalue, "config:32");
 PMU_FORMAT_ATTR(mode, "config:33-40");
@@ -69,6 +69,25 @@ static struct attribute_group imc_format_group = {
.attrs = imc_format_attrs,
 };
 
+/* Format attribute for imc trace-mode */
+PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
+PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
+PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
+PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
+static struct attribute *trace_imc_format_attrs[] = {
+   &format_attr_event.attr,
+   &format_attr_cpmc_reserved.attr,
+   &format_attr_cpmc_event.attr,
+   &format_attr_cpmc_samplesel.attr,
+   &format_attr_cpmc_load.attr,
+   NULL,
+};
+
+static struct attribute_group trace_imc_format_group = {
+   .name = "format",
+   .attrs = trace_imc_format_attrs,
+};
+
 /* Get the cpumask printed to a buffer "buf" */
 static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
struct device_attribute *attr,
@@ -1120,6 +1139,173 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   /* Enable the sched_task to start the engine */
+   perf_sched_cb_inc(event->ctx->pmu);
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   dump_trace_imc_data(event);
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   trace_imc_event_read(event);
+}
+
+static void trace_imc_event_start(struct perf_event *event, int flags)
+{
+   return;
+}
+
+static void trace_imc_event_del(struct perf_event *event, int flags)
+{
+  

[PATCH v3 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2019-02-06 Thread Anju T Sudhakar
This patch detects trace-imc events, does memory initializations for each
online cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c   | 91 +++
 arch/powerpc/platforms/powernv/opal-imc.c |  3 +
 include/linux/cpuhotplug.h|  1 +
 3 files changed, 95 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 5ca80545a849..1f09265c8fb0 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1068,6 +1072,54 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1189,6 +1241,17 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1230,6 +1293,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1312,6 +1380,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (res) {
+   cleanup_all_trace_imc_memory();
+   goto err;
+   }
+   }
+   break;
default:
return -EINVAL;
}
@@ -1384,6 +1467,14 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
goto err_free_mem;
}
 
+   break;
+   case IMC_DOMAIN_TRACE:
+   ret = trace_imc_cpu_init();
+   if (ret) {
+   cleanup_

[PATCH v3 3/5] powerpc/perf: Add privileged access check for thread_imc

2019-02-06 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to the thread_imc pmu, since
some events report privilege level information.

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3bef46f8417d..5ca80545a849 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -877,6 +877,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.1



[PATCH v3 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2019-02-06 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e. bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f292a3f284f1..3bef46f8417d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.1



[PATCH v3 0/5] powerpc/perf: IMC trace-mode support

2019-02-06 Thread Anju T Sudhakar
https://lists.ozlabs.org/pipermail/skiboot/2018-December/012883.html

* Set LDBAR spr to enable imc-trace mode.

LDBAR Layout:

0 : Enable/Disable
1 : 0 -> Accumulation Mode
1 -> Trace Mode
2:3   : Reserved
4-6   : PB scope
7 : Reserved
8:50  : Counter Address
51:63 : Reserved

   

The key benefit of imc trace-mode is that each sample record contains the
instruction address along with other information, so we can profile the
IP without interrupting the application.

Performance data using 'perf top' with and without trace-imc event:

When the application is monitored with a trace-imc event, we don't take
any PMI interrupts.

PMI interrupts count when the `perf top` command is executed without a trace-imc event:

# perf top  
12.53%  [kernel]   [k] arch_cpu_idle   
11.32%  [kernel]   [k] rcu_idle_enter  
10.76%  [kernel]   [k] __next_timer_interrupt  
 9.49%  [kernel]   [k] find_next_bit   
 8.06%  [kernel]   [k] rcu_dynticks_eqs_exit   
 7.82%  [kernel]   [k] do_idle 
 5.71%  [kernel]   [k] tick_nohz_idle_stop_tic 
 [---]  
# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (right after 'perf top' without trace-imc event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.  

Changelog:

From v2 -> v3
--

* Redefined the event format for trace-imc.

Suggestions/comments are welcome.


Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

Madhavan Srinivasan (1):
  powerpc/perf: Add privileged access check for thread_imc

 arch/powerpc/include/asm/im

[PATCH v3 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2019-02-06 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.1



[PATCH v2] powerpc/perf: Fix loop exit condition in nest_imc_event_init

2018-12-17 Thread Anju T Sudhakar
The data structure (i.e. struct imc_mem_info) that holds the memory address
information for nest imc units is allocated based on the number of nodes
in the system.

nest_imc_event_init() traverses this struct array to calculate the memory
base address for the event-cpu. If we fail to find a match for the event
cpu's chip-id in the imc_mem_info struct array, then the do-while loop
will iterate until we crash.

Fix this by changing the loop exit condition to be based on the number of
non-zero vbase elements in the array, since the allocation is done for
nr_chips + 1 (see the sketch below).
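
The same pattern in isolation (a standalone sketch, not the kernel diff):
kcalloc() zero-fills, so allocating one extra element gives the scan a
sentinel with vbase == 0 to stop on; the pointer itself is never NULL once
the array exists.

    struct imc_mem_info *pcni = kcalloc(nr_chips + 1, sizeof(*pcni),
                                        GFP_KERNEL);
    /* the first nr_chips entries get filled in; the last stays zeroed */

    do {
            if (pcni->id == chip_id) {
                    /* found the base address for this chip */
                    break;
            }
            pcni++;
    } while (pcni->vbase != 0);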

Reported-by: Dan Carpenter  
Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support")
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c   | 2 +-
 arch/powerpc/platforms/powernv/opal-imc.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 4f34c75..d1009fe 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -508,7 +508,7 @@ static int nest_imc_event_init(struct perf_event *event)
break;
}
pcni++;
-   } while (pcni);
+   } while (pcni->vbase != 0);
 
if (!flag)
return -ENODEV;
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 58a0794..3d27f02 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -127,7 +127,7 @@ static int imc_get_mem_addr_nest(struct device_node *node,
nr_chips))
goto error;
 
-   pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(*pmu_ptr->mem_info),
+   pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info),
GFP_KERNEL);
if (!pmu_ptr->mem_info)
goto error;
-- 
1.8.3.1
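
Taken together, the two hunks pair the over-allocation with a
sentinel-terminated walk. Schematically (an illustrative recombination of
the hunks above, not new code):

/* kcalloc() zeroes the extra (nr_chips + 1)-th entry, so its vbase == 0
 * bounds the walk even when no chip-id matches. */
pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info),
			    GFP_KERNEL);
...
do {
	if (pcni->id == chip_id)
		break;			/* found the event cpu's chip */
	pcni++;
} while (pcni->vbase != 0);		/* stop at the zeroed sentinel */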



[PATCH v2 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2018-12-14 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc,
the mode bit (i.e. bit 1) of LDBAR is set to accumulation.
Currently, LDBAR is loaded with the per-cpu memory address and the mode is
set to accumulation at boot time.

To enable trace-imc, the mode bit of LDBAR should be set to 'trace'. So, to
accommodate the trace-mode of IMC, reposition the setting of LDBAR for
thread-imc to thread_imc_event_add(), and reset LDBAR in thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f292a3f284f1..3bef46f8417d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.1



[PATCH v2 5/5] powerpc/perf: Trace imc PMU functions

2018-12-14 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 175 
 1 file changed, 175 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 1f09265c8fb0..32ff0e449fca 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1120,6 +1120,173 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   /* Enable the sched_task to start the engine */
+   perf_sched_cb_inc(event->ctx->pmu);
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   dump_trace_imc_data(event);
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   trace_imc_event_read(event);
+}
+
+static void trace_imc_event_start(struct perf_event *event, int flags)
+{
+   return;
+}
+
+static void trace_imc_event_del(struct perf_event *event, int flags)
+{
+   perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void trace_imc_pmu_sched_task(struct perf_event_context *ctx,
+   bool sched_in)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory 
address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
TRACE_IMC_ENABLE;
+
+   ref = &core_imc_refc[core_id];
+   if (!ref)
+   return;
+
+   if (sched_in) {
+   mtspr(SPRN_LDBAR, ldbar_value);
+   mutex_lock(&ref->lock);
+   if (ref->refc == 0) {
+   if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
+   
get_hard_smp_processor_id(smp_processor_id()))) {
+   mutex_unlock(&ref->lock);
+   pr_err("trace-imc: Unable to start the counters 
for core %d\n", core_id);
+   mtspr(SPRN_LDBAR, 0);
+   return;
+   }
+   }
+   ++ref->refc;
+   mutex_unlock(&ref->lock);
+   } else {
+   mtspr(SPRN_LDBAR, 0);
+   mutex_lock(&ref->lock);
+   ref->refc--;
+ 
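
Presumably update_pmu_ops() is extended to wire these callbacks in for
IMC_DOMAIN_TRACE, along these lines (hypothetical sketch, not the patch
text):

/* Hypothetical wiring of the trace-imc callbacks defined above. */
if (pmu->domain == IMC_DOMAIN_TRACE) {
	pmu->pmu.add = trace_imc_event_add;
	pmu->pmu.del = trace_imc_event_del;
	pmu->pmu.start = trace_imc_event_start;
	pmu->pmu.stop = trace_imc_event_stop;
	pmu->pmu.read = trace_imc_event_read;
	pmu->pmu.sched_task = trace_imc_pmu_sched_task;
}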

[PATCH v2 3/5] powerpc/perf: Add privileged access check for thread_imc

2018-12-14 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to thread_imc pmu since
some event report privilege level information.

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3bef46f8417d..5ca80545a849 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -877,6 +877,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.1



[PATCH v2 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2018-12-14 Thread Anju T Sudhakar
Patch detects trace-imc events, does memory initialization for each online
cpu, and registers cpuhotplug callbacks.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c   | 91 +++
 arch/powerpc/platforms/powernv/opal-imc.c |  3 +
 include/linux/cpuhotplug.h|  1 +
 3 files changed, 95 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 5ca80545a849..1f09265c8fb0 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1068,6 +1072,54 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1189,6 +1241,17 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1230,6 +1293,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1312,6 +1380,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (res) {
+   cleanup_all_trace_imc_memory();
+   goto err;
+   }
+   }
+   break;
default:
return -EINVAL;
}
@@ -1384,6 +1467,14 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
goto err_free_mem;
}
 
+   break;
+   case IMC_DOMAIN_TRACE:
+   ret = trace_imc_cpu_init();
+   if (ret) {
+   cleanup_all_trace_imc_memory();
+   goto err_fre

[PATCH v2 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2018-12-14 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and the data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003ffffffffe000ULL
 #define THREAD_IMC_ENABLE   0x8000000000000000ULL
+#define TRACE_IMC_ENABLE   0x4000000000000000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64-byte record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * DW 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xffffffffULL
 
+/*
+ * Macro to mask off bits 0:21 of the first double word (the timebase), for
+ * comparison with the 8th double word (timebase) of the trace imc record.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffffffffffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.1



[PATCH v2 0/5] powerpc/perf: IMC trace-mode support

2018-12-14 Thread Anju T Sudhakar
bs.org/pipermail/skiboot/2018-December/012883.html

* Set LDBAR spr to enable imc-trace mode.

LDBAR Layout:

0 : Enable/Disable
1 : 0 -> Accumulation Mode
1 -> Trace Mode
2:3   : Reserved
4-6   : PB scope
7 : Reserved
8:50  : Counter Address
51:63 : Reserved
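
As a sketch (using the masks from patch 1/5 of this series; illustrative,
not part of the cover letter), an LDBAR value for trace mode is composed
and cleared like this:

/* Compose an LDBAR value per the layout above: counter address in bits
 * 8:50, enable bit set, mode bit = trace. */
u64 ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;

mtspr(SPRN_LDBAR, ldbar_value);	/* enable trace mode on this thread */
mtspr(SPRN_LDBAR, 0);		/* reset on sched-out / cpu offline  */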

   

The key benefit of imc trace-mode is that each sample record contains the
instruction pointer along with other information, so we can profile the IP
without interrupting the application.

Performance data using 'perf top' with and without trace-imc event:

When the application is monitored with a trace-imc event, we don't take any
PMI interrupts.

PMI interrupt count when the `perf top` command is executed without a trace-imc event:

# perf top  
12.53%  [kernel]   [k] arch_cpu_idle   
11.32%  [kernel]   [k] rcu_idle_enter  
10.76%  [kernel]   [k] __next_timer_interrupt  
 9.49%  [kernel]   [k] find_next_bit   
 8.06%  [kernel]   [k] rcu_dynticks_eqs_exit   
 7.82%  [kernel]   [k] do_idle 
 5.71%  [kernel]   [k] tick_nohz_idle_stop_tick
 [---]  
# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (right after 'perf top' without trace-imc event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.  

Changelog:

From v1 -> v2
--

* Added privileged access check for thread-imc and trace-imc

Suggestions/comments are welcome.

Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

Madhavan Srinivasan (1):
  powerpc/perf: Add privileged access check for thread_imc

 arch/powerpc

[PATCH 0/4] powerpc/perf: IMC trace-mode support

2018-11-28 Thread Anju T Sudhakar
0 : Enable/Disable
1 : 0 -> Accumulation Mode
1 -> Trace Mode
2:3   : Reserved
4-6   : PB scope
7 : Reserved
8:50  : Counter Address
51:63 : Reserved

   

The key benefit of imc trace-mode is that each sample record contains the
instruction pointer along with other information, so we can profile the IP
without interrupting the application.

Performance data using 'perf top' with and without trace-imc event:

When the application is monitored with a trace-imc event, we don't take any
PMI interrupts.

PMI interrupt count when the `perf top` command is executed without a trace-imc event:

# perf top  
12.53%  [kernel]   [k] arch_cpu_idle   
11.32%  [kernel]   [k] rcu_idle_enter  
10.76%  [kernel]   [k] __next_timer_interrupt  
 9.49%  [kernel]   [k] find_next_bit   
 8.06%  [kernel]   [k] rcu_dynticks_eqs_exit   
 7.82%  [kernel]   [k] do_idle 
 5.71%  [kernel]   [k] tick_nohz_idle_stop_tick
 [---]  
# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (right after 'perf top' without trace-imc event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   
        
The PMI interrupts count remains the same.

Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

 arch/powerpc/include/asm/imc-pmu.h|  39 +++
 arch/powerpc/include/asm/opal-api.h   |   1 +
 arch/powerpc/perf/imc-pmu.c   | 291 +-
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 5 files changed, 324 insertions(+), 11 deletions(-)

-- 
2.17.1



[PATCH 4/4] powerpc/perf: Trace imc PMU functions

2018-11-28 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 172 
 1 file changed, 172 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d9ffe7f03f1e..18af7c3e2345 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1117,6 +1117,170 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   /* Enable the sched_task to start the engine */
+   perf_sched_cb_inc(event->ctx->pmu);
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   dump_trace_imc_data(event);
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   trace_imc_event_read(event);
+}
+
+static void trace_imc_event_start(struct perf_event *event, int flags)
+{
+   return;
+}
+
+static void trace_imc_event_del(struct perf_event *event, int flags)
+{
+   perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void trace_imc_pmu_sched_task(struct perf_event_context *ctx,
+   bool sched_in)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory 
address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
TRACE_IMC_ENABLE;
+
+   ref = &core_imc_refc[core_id];
+   if (!ref)
+   return;
+
+   if (sched_in) {
+   mtspr(SPRN_LDBAR, ldbar_value);
+   mutex_lock(&ref->lock);
+   if (ref->refc == 0) {
+   if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
+   
get_hard_smp_processor_id(smp_processor_id()))) {
+   mutex_unlock(&ref->lock);
+   pr_err("trace-imc: Unable to start the counters 
for core %d\n", core_id);
+   mtspr(SPRN_LDBAR, 0);
+   return;
+   }
+   }
+   ++ref->refc;
+   mutex_unlock(&ref->lock);
+   } else {
+   mtspr(SPRN_LDBAR, 0);
+   mutex_lock(&ref->lock);
+   ref->refc--;
+ 

[PATCH 3/4] powerpc/perf: Trace imc events detection and cpuhotplug

2018-11-28 Thread Anju T Sudhakar
Patch detects trace-imc events, does memory initialization for each online
cpu, and registers cpuhotplug callbacks.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c   | 91 +++
 arch/powerpc/platforms/powernv/opal-imc.c |  3 +
 include/linux/cpuhotplug.h|  1 +
 3 files changed, 95 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3bef46f8417d..d9ffe7f03f1e 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1065,6 +1069,54 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1186,6 +1238,17 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1227,6 +1290,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1309,6 +1377,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (res) {
+   cleanup_all_trace_imc_memory();
+   goto err;
+   }
+   }
+   break;
default:
return -EINVAL;
}
@@ -1381,6 +1464,14 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
goto err_free_mem;
}
 
+   break;
+   case IMC_DOMAIN_TRACE:
+   ret = trace_imc_cpu_init();
+   if (ret) {
+   cleanup_all_trace_imc_memory();
+   goto err_fre

[PATCH 1/4] powerpc/include: Add data structures and macros for IMC trace mode

2018-11-28 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and the data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003ffffffffe000ULL
 #define THREAD_IMC_ENABLE   0x8000000000000000ULL
+#define TRACE_IMC_ENABLE   0x4000000000000000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64-byte record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * DW 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xffffffffULL
 
+/*
+ * Macro to mask off bits 0:21 of the first double word (the timebase), for
+ * comparison with the 8th double word (timebase) of the trace imc record.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffffffffffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.1



[PATCH 2/4] powerpc/perf: Rearrange setting of ldbar for thread-imc

2018-11-28 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc,
the mode bit (i.e. bit 1) of LDBAR is set to accumulation.
Currently, LDBAR is loaded with the per-cpu memory address and the mode is
set to accumulation at boot time.

To enable trace-imc, the mode bit of LDBAR should be set to 'trace'. So, to
accommodate the trace-mode of IMC, reposition the setting of LDBAR for
thread-imc to thread_imc_event_add(), and reset LDBAR in thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f292a3f284f1..3bef46f8417d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.1



[PATCH] powerpc/perf: Return accordingly on invalid chip-id in nest_imc_event_init

2018-11-27 Thread Anju T Sudhakar
Nest hardware counter memory resides in a per-chip reserved memory region.
During nest_imc_event_init(), the chip-id of the event cpu is used to
calculate the base memory address for that cpu. Return a proper error
code if the calculated chip_id is invalid.

Reported-by: Dan Carpenter 
Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support")
Reviewed-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 6954636b16d1..78514170cf71 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -496,6 +496,11 @@ static int nest_imc_event_init(struct perf_event *event)
 * Get the base memory address for this cpu.
 */
chip_id = cpu_to_chip_id(event->cpu);
+
+   /* Return, if chip_id is not valid */
+   if (chip_id < 0)
+   return -ENODEV;
+
pcni = pmu->mem_info;
do {
if (pcni->id == chip_id) {
-- 
2.17.1



[PATCH] powerpc/perf: Fix loop exit condition in nest_imc_event_init

2018-11-27 Thread Anju T Sudhakar
The data structure (i.e struct imc_mem_info) to hold the memory address
information for nest imc units is allocated based on the number of nodes
in the system.

nest_imc_event_init() traverses this struct array to calculate the memory
base address for the event cpu. If we fail to find a match for the event
cpu's chip-id in the imc_mem_info struct array, the do-while loop will
iterate past the end of the array until we crash.

Fix this by bounding the loop with the number of nodes in the system.

Reported-by: Dan Carpenter  
Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support")
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 78514170cf71..e9dc771f3e3d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -459,7 +459,7 @@ static void nest_imc_counters_release(struct perf_event 
*event)
 
 static int nest_imc_event_init(struct perf_event *event)
 {
-   int chip_id, rc, node_id;
+   int chip_id, rc, node_id, nr_chips = num_possible_nodes();
u32 l_config, config = event->attr.config;
struct imc_mem_info *pcni;
struct imc_pmu *pmu;
@@ -508,7 +508,7 @@ static int nest_imc_event_init(struct perf_event *event)
break;
}
pcni++;
-   } while (pcni);
+   } while (--nr_chips);
 
if (!flag)
return -ENODEV;
-- 
2.17.1



Re: [bug report] powerpc/perf: Add nest IMC PMU support

2018-10-24 Thread Anju T Sudhakar

Hi,


On 10/18/18 3:03 PM, Dan Carpenter wrote:

Hello Anju T Sudhakar,

The patch 885dcd709ba9: "powerpc/perf: Add nest IMC PMU support" from
Jul 19, 2017, leads to the following static checker warning:

arch/powerpc/perf/imc-pmu.c:506 nest_imc_event_init()
warn: 'pcni' can't be NULL.


Unfortunately this warning didn't appear when I checked with smatch.

Could you please provide the steps to reproduce this?

This is the commit id with which I built smatch: commit 
79fe36620a7a3a45d1a51d62238da250fb8db920


But anyway I am looking into the code part. Thanks for mentioning this.

I will update soon.


Thanks,

Anju





[PATCH v2 4/4] powerpc/perf: Unregister thread-imc if core-imc not supported

2018-05-22 Thread Anju T Sudhakar
Since thread-imc internally uses the core-imc hardware infrastructure
and depends on it, having thread-imc in the kernel in the
absence of core-imc is pointless. Disable thread-imc if
core-imc is not registered.

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/imc-pmu.h|  1 +
 arch/powerpc/perf/imc-pmu.c   | 12 
 arch/powerpc/platforms/powernv/opal-imc.c |  9 +
 3 files changed, 22 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index d76cb11..69f516e 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -128,4 +128,5 @@ extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
 extern void thread_imc_disable(void);
 extern int get_max_nest_dev(void);
+extern void unregister_thread_imc(void);
 #endif /* __ASM_POWERPC_IMC_PMU_H */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f563d30..d1977b6 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -40,6 +40,7 @@ static struct imc_pmu *core_imc_pmu;
 /* Thread IMC data structures and variables */
 
 static DEFINE_PER_CPU(u64 *, thread_imc_mem);
+static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
 struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
@@ -1228,6 +1229,16 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
}
 }
 
+/*
+ * Function to unregister thread-imc if core-imc
+ * is not registered.
+ */
+void unregister_thread_imc(void)
+{
+   imc_common_cpuhp_mem_free(thread_imc_pmu);
+   imc_common_mem_free(thread_imc_pmu);
+   perf_pmu_unregister(&thread_imc_pmu->pmu);
+}
 
 /*
  * imc_mem_init : Function to support memory allocation for core imc.
@@ -1296,6 +1307,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
}
}
 
+   thread_imc_pmu = pmu_ptr;
break;
default:
return -EINVAL;
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 490bb72..58a0794 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -255,6 +255,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
 {
struct device_node *imc_dev = pdev->dev.of_node;
int pmu_count = 0, domain;
+   bool core_imc_reg = false, thread_imc_reg = false;
u32 type;
 
/*
@@ -292,6 +293,10 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
if (!imc_pmu_create(imc_dev, pmu_count, domain)) {
if (domain == IMC_DOMAIN_NEST)
pmu_count++;
+   if (domain == IMC_DOMAIN_CORE)
+   core_imc_reg = true;
+   if (domain == IMC_DOMAIN_THREAD)
+   thread_imc_reg = true;
}
}
 
@@ -299,6 +304,10 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
if (pmu_count == 0)
debugfs_remove_recursive(imc_debugfs_parent);
 
+   /* If core imc is not registered, unregister thread-imc */
+   if (!core_imc_reg && thread_imc_reg)
+   unregister_thread_imc();
+
return 0;
 }
 
-- 
2.7.4



[PATCH v2 3/4] powerpc/perf: Return appropriate value for unknown domain

2018-05-22 Thread Anju T Sudhakar
Return proper error code for unknown domain during IMC initialization.

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 61000d1..f563d30 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1371,7 +1371,7 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
 
break;
default:
-   return  -1; /* Unknown domain */
+   return  -EINVAL;/* Unknown domain */
}
 
ret = update_events_in_group(parent, pmu_ptr);
-- 
2.7.4



[PATCH v2 2/4] powerpc/perf: Replace the direct return with goto statement

2018-05-22 Thread Anju T Sudhakar
Replace the direct return statement in imc_mem_init() with goto, to adhere
to the kernel coding style.

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> 
Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index c1665ff..61000d1 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1236,7 +1236,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
int pmu_index)
 {
const char *s;
-   int nr_cores, cpu, res;
+   int nr_cores, cpu, res = -ENOMEM;
 
if (of_property_read_string(parent, "name", ))
return -ENODEV;
@@ -1246,7 +1246,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
/* Update the pmu name */
pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", 
s);
if (!pmu_ptr->pmu.name)
-   return -ENOMEM;
+   goto err;
 
/* Needed for hotplug/migration */
if (!per_nest_pmu_arr) {
@@ -1254,7 +1254,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
sizeof(struct imc_pmu *),
GFP_KERNEL);
if (!per_nest_pmu_arr)
-   return -ENOMEM;
+   goto err;
}
per_nest_pmu_arr[pmu_index] = pmu_ptr;
break;
@@ -1262,21 +1262,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
/* Update the pmu name */
pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
if (!pmu_ptr->pmu.name)
-   return -ENOMEM;
+   goto err;
 
nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct 
imc_mem_info),
GFP_KERNEL);
 
if (!pmu_ptr->mem_info)
-   return -ENOMEM;
+   goto err;
 
core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
GFP_KERNEL);
 
if (!core_imc_refc) {
kfree(pmu_ptr->mem_info);
-   return -ENOMEM;
+   goto err;
}
 
core_imc_pmu = pmu_ptr;
@@ -1285,14 +1285,14 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
/* Update the pmu name */
pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
if (!pmu_ptr->pmu.name)
-   return -ENOMEM;
+   goto err;
 
thread_imc_mem_size = pmu_ptr->counter_mem_size;
for_each_online_cpu(cpu) {
res = thread_imc_mem_alloc(cpu, 
pmu_ptr->counter_mem_size);
if (res) {
cleanup_all_thread_imc_memory();
-   return res;
+   goto err;
}
}
 
@@ -1302,6 +1302,8 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
}
 
return 0;
+err:
+   return res;
 }
 
 /*
-- 
2.7.4



[PATCH v2 1/4] powerpc/perf: Rearrange memory freeing in imc init

2018-05-22 Thread Anju T Sudhakar
When any of the IMC (In-Memory Collection counter) devices fails
to initialize, imc_common_mem_free() frees a set of memory. In doing so,
the pmu_ptr pointer is also freed. But pmu_ptr is then used in a subsequent
function (imc_common_cpuhp_mem_free()), which is wrong. Reorder
the code to avoid such access.

Also free the memory which is dynamically allocated during imc
initialization, wherever required.

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c   | 32 ---
 arch/powerpc/platforms/powernv/opal-imc.c | 13 ++---
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 75fb23c..c1665ff 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1153,7 +1153,7 @@ static void cleanup_all_core_imc_memory(void)
/* mem_info will never be NULL */
for (i = 0; i < nr_cores; i++) {
if (ptr[i].vbase)
-   free_pages((u64)ptr->vbase, get_order(size));
+   free_pages((u64)ptr[i].vbase, get_order(size));
}
 
kfree(ptr);
@@ -1191,7 +1191,6 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
if (pmu_ptr->attr_groups[IMC_EVENT_ATTR])
kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
-   kfree(pmu_ptr);
 }
 
 /*
@@ -1208,6 +1207,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)

cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
kfree(nest_imc_refc);
kfree(per_nest_pmu_arr);
+   per_nest_pmu_arr = NULL;
}
 
if (nest_pmus > 0)
@@ -1319,10 +1319,8 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
int ret;
 
ret = imc_mem_init(pmu_ptr, parent, pmu_idx);
-   if (ret) {
-   imc_common_mem_free(pmu_ptr);
-   return ret;
-   }
+   if (ret)
+   goto err_free_mem;
 
switch (pmu_ptr->domain) {
case IMC_DOMAIN_NEST:
@@ -1337,7 +1335,9 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
ret = init_nest_pmu_ref();
if (ret) {
mutex_unlock(&nest_init_lock);
-   goto err_free;
+   kfree(per_nest_pmu_arr);
+   per_nest_pmu_arr = NULL;
+   goto err_free_mem;
}
/* Register for cpu hotplug notification. */
ret = nest_pmu_cpumask_init();
@@ -1345,7 +1345,8 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
mutex_unlock(&nest_init_lock);
kfree(nest_imc_refc);
kfree(per_nest_pmu_arr);
-   goto err_free;
+   per_nest_pmu_arr = NULL;
+   goto err_free_mem;
}
}
nest_pmus++;
@@ -1355,7 +1356,7 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
ret = core_imc_pmu_cpumask_init();
if (ret) {
cleanup_all_core_imc_memory();
-   return ret;
+   goto err_free_mem;
}
 
break;
@@ -1363,7 +1364,7 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
ret = thread_imc_cpu_init();
if (ret) {
cleanup_all_thread_imc_memory();
-   return ret;
+   goto err_free_mem;
}
 
break;
@@ -1373,23 +1374,24 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
 
ret = update_events_in_group(parent, pmu_ptr);
if (ret)
-   goto err_free;
+   goto err_free_cpuhp_mem;
 
ret = update_pmu_ops(pmu_ptr);
if (ret)
-   goto err_free;
+   goto err_free_cpuhp_mem;
 
ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1);
if (ret)
-   goto err_free;
+   goto err_free_cpuhp_mem;
 
pr_info("%s performance monitor hardware support registered\n",
pmu_ptr->pmu.name);
 
return 0;
 
-err_free:
-   imc_common_mem_free(pmu_ptr);
+err_free_cpuhp_mem:
  

[PATCH v2 0/4] powerpc/perf: IMC Cleanups

2018-05-22 Thread Anju T Sudhakar
This patch series includes some cleanups and Unregistration of  
thread-imc pmu, if the kernel does not have core-imc registered.

The entire patch set has been verified using the static checker smatch. 
Command used:   
$ make ARCH=powerpc CHECK="/smatch -p=kernel"  C=1 vmlinux | tee warns.txt

Tests Done: 

* Fail core-imc at init:
nest-imc - working  
cpuhotplug - works as expected  
thread-imc - not registered 

* Fail thread-imc at init:  
nest-imc - works
core-imc - works
cpuhotplug - works  

* Fail nest-imc at init 

core-imc - works
thread-imc -works   
cpuhotplug - works  

* Fail only one nest unit (say for mcs23)   

Other nest-units - works
core-imc - works
thread-imc - works  
cpuhotplug - works. 


* Kexec works   

The first three patches in this series addresses the comments by Dan Carpenter. 

Patch series is based on:
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git (merge branch)


Anju T Sudhakar (4):
  powerpc/perf: Rearrange memory freeing in imc init
  powerpc/perf: Replace the direct return with goto statement
  powerpc/perf: Return appropriate value for unknown domain
  powerpc/perf: Unregister thread-imc if core-imc not supported

 arch/powerpc/include/asm/imc-pmu.h|  1 +
 arch/powerpc/perf/imc-pmu.c   | 64 +++
 arch/powerpc/platforms/powernv/opal-imc.c | 22 +--
 3 files changed, 60 insertions(+), 27 deletions(-)

-- 
2.7.4



[PATCH] powerpc/perf: Remove sched_task function defined for thread-imc

2018-05-18 Thread Anju T Sudhakar
Call trace observed while running perf-fuzzer:

[  329.228068] CPU: 43 PID: 9088 Comm: perf_fuzzer Not tainted 
4.13.0-32-generic #35~lp1746225
[  329.228070] task: c03f776ac900 task.stack: c03f77728000
[  329.228071] NIP: c0299b70 LR: c02a4534 CTR: c029bb80
[  329.228073] REGS: c03f7772b760 TRAP: 0700   Not tainted  
(4.13.0-32-generic)
[  329.228073] MSR: 9282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>
[  329.228079]   CR: 24008822  XER: 
[  329.228080] CFAR: c0299a70 SOFTE: 0
GPR00: c02a4534 c03f7772b9e0 c1606200 c03fef858908
GPR04: c03f776ac900 0001  003fee73
GPR08:   c11220d8 0002
GPR12: c029bb80 c7a3d900  
GPR16:    
GPR20:   c03f776ad090 c0c71354
GPR24: c03fef716780 003fee73 c03fe69d4200 c03f776ad330
GPR28: c11220d8 0001 c14c6108 c03fef858900
[  329.228098] NIP [c0299b70] perf_pmu_sched_task+0x170/0x180
[  329.228100] LR [c02a4534] __perf_event_task_sched_in+0xc4/0x230
[  329.228101] Call Trace:
[  329.228102] [c03f7772b9e0] [c02a0678] 
perf_iterate_sb+0x158/0x2a0 (unreliable)
[  329.228105] [c03f7772ba30] [c02a4534] 
__perf_event_task_sched_in+0xc4/0x230
[  329.228107] [c03f7772bab0] [c01396dc] 
finish_task_switch+0x21c/0x310
[  329.228109] [c03f7772bb60] [c0c71354] __schedule+0x304/0xb80
[  329.228111] [c03f7772bc40] [c0c71c10] schedule+0x40/0xc0
[  329.228113] [c03f7772bc60] [c01033f4] do_wait+0x254/0x2e0
[  329.228115] [c03f7772bcd0] [c0104ac0] kernel_wait4+0xa0/0x1a0
[  329.228117] [c03f7772bd70] [c0104c24] SyS_wait4+0x64/0xc0
[  329.228121] [c03f7772be30] [c000b184] system_call+0x58/0x6c
[  329.228121] Instruction dump:
[  329.228123] 3beafea0 7faa4800 409eff18 e8010060 eb610028 ebc10040 7c0803a6 
38210050
[  329.228127] eb81ffe0 eba1ffe8 ebe1fff8 4e800020 <0fe0> 4bbc 6000 
6042
[  329.228131] ---[ end trace 8c46856d314c1811 ]---
[  375.755943] hrtimer: interrupt took 31601 ns


The context-switch callbacks for thread-imc are defined in the sched_task
function. So when thread-imc events are grouped with software pmu events,
perf_pmu_sched_task() hits the WARN_ON_ONCE condition, since software PMUs
are assumed not to have a sched_task defined.

Move the thread-imc enable/disable opal callbacks from sched_task to the
event_[add/del] functions.
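
For context, the warning fires in perf core; a paraphrased sketch of that
path (from kernel/events/core.c of this era; exact code may differ):

/* Software PMUs are expected to leave pmu->sched_task NULL, so a
 * thread-imc event grouped into a software-PMU context trips this
 * warning on every context switch. */
list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
	pmu = cpuctx->ctx.pmu;	/* software PMUs will not have sched_task */
	if (WARN_ON_ONCE(!pmu->sched_task))
		continue;
	/* ... invoke pmu->sched_task(...) ... */
}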

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 108 +---
 1 file changed, 51 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d7532e7..71d9ba7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -866,59 +866,6 @@ static int thread_imc_cpu_init(void)
  ppc_thread_imc_cpu_offline);
 }
 
-void thread_imc_pmu_sched_task(struct perf_event_context *ctx,
- bool sched_in)
-{
-   int core_id;
-   struct imc_pmu_ref *ref;
-
-   if (!is_core_imc_mem_inited(smp_processor_id()))
-   return;
-
-   core_id = smp_processor_id() / threads_per_core;
-   /*
-* imc pmus are enabled only when it is used.
-* See if this is triggered for the first time.
-* If yes, take the mutex lock and enable the counters.
-* If not, just increment the count in ref count struct.
-*/
-   ref = &core_imc_refc[core_id];
-   if (!ref)
-   return;
-
-   if (sched_in) {
-   mutex_lock(&ref->lock);
-   if (ref->refc == 0) {
-   if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
-get_hard_smp_processor_id(smp_processor_id()))) {
-   mutex_unlock(&ref->lock);
-   pr_err("thread-imc: Unable to start the counter\
-   for core %d\n", 
core_id);
-   return;
-   }
-   }
-   ++ref->refc;
-   mutex_unlock(&ref->lock);
-   } else {
-   mutex_lock(&ref->lock);
-   ref->refc--;
-   if (ref->refc == 0) {
-   if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
-   get_hard_smp_processor_id(smp_processor_id()))) {
-   mutex_unlock(&ref->lock);
-   pr_err("thread-imc: Unable to stop the counters\
-   for c

Re: [PATCH v2] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()

2018-05-16 Thread Anju T Sudhakar



On Wednesday 16 May 2018 12:18 PM, ppaidipe wrote:

On 2018-05-16 12:05, Anju T Sudhakar wrote:

Currently memory is allocated for core-imc based on cpu_present_mask,
which has bit 'cpu' set iff cpu is populated. We use (cpu number / threads
per core) as the array index to access the memory.

Under some circumstances firmware marks a CPU as a GUARDed CPU and boots the
system; until cleared of errors, these CPUs are unavailable for all
subsequent boots. GUARDed CPUs are possible but not present from the linux
view, so it blows a hole when we assume the max length of our allocation
is driven by our max present cpus, whereas one of the cpus might be online
and be beyond the max present cpus, due to the hole.
So the (cpu number / threads per core) index can run past the end of the
allocation, leading to memory overflow.

Call trace observed during a guard test:

Faulting instruction address: 0xc0149f1c
cpu 0x69: Vector: 380 (Data Access Out of Range) at [c03fea303420]
    pc:c0149f1c: prefetch_freepointer+0x14/0x30
    lr:c014e0f8: __kmalloc+0x1a8/0x1ac
    sp:c03fea3036a0
   msr:90009033
   dar:c9c54b2c91dbf6b7
  current = 0xc03fea2c
  paca    = 0xcfddd880 softe: 3 irq_happened: 0x01
    pid   = 1, comm = swapper/104
Linux version 4.16.7-openpower1 (smc@smc-desktop) (gcc version 6.4.0
(Buildroot 2018.02.1-6-ga8d1126)) #2 SMP Fri May 4 16:44:54 PDT 2018
enter ? for help
call trace:
 __kmalloc+0x1a8/0x1ac
 (unreliable)
 init_imc_pmu+0x7f4/0xbf0
 opal_imc_counters_probe+0x3fc/0x43c
 platform_drv_probe+0x48/0x80
 driver_probe_device+0x22c/0x308
 __driver_attach+0xa0/0xd8
 bus_for_each_dev+0x88/0xb4
 driver_attach+0x2c/0x40
 bus_add_driver+0x1e8/0x228
 driver_register+0xd0/0x114
 __platform_driver_register+0x50/0x64
 opal_imc_driver_init+0x24/0x38
 do_one_initcall+0x150/0x15c
 kernel_init_freeable+0x250/0x254
 kernel_init+0x1c/0x150
 ret_from_kernel_thread+0x5c/0xc8

Allocating memory for core-imc based on cpu_possible_mask, which has
bit 'cpu' set iff cpu is populatable, will fix this issue.

Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com>
Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsinghar...@gmail.com>


I have verified this fix with both normal and kexec boot multiple times
when the system has GUARDed cores. Not seen any crash/memory corruption
issues with this.

Tested-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com>




Thanks.

-Anju


---
 arch/powerpc/perf/imc-pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d7532e7..75fb23c 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void)

 static void cleanup_all_core_imc_memory(void)
 {
-    int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+    int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);

 struct imc_mem_info *ptr = core_imc_pmu->mem_info;
 int size = core_imc_pmu->counter_mem_size;

@@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr,
struct device_node *parent,
 if (!pmu_ptr->pmu.name)
 return -ENOMEM;

-    nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+    nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
 pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
 GFP_KERNEL);




[PATCH v2] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()

2018-05-16 Thread Anju T Sudhakar
Currently memory is allocated for core-imc based on cpu_present_mask,
which has bit 'cpu' set iff cpu is populated. We use (cpu number / threads
per core) as the array index to access the memory.

Under some circumstances firmware marks a CPU as GUARDed and boots the
system; until cleared of errors, these CPUs are unavailable for all
subsequent boots. GUARDed CPUs are possible but not present from the
Linux view, which blows a hole in the assumption that the maximum length
of our allocation is driven by our maximum present cpus: one of the cpus
might be online and beyond the maximum present cpus, due to the hole.
The (cpu number / threads per core) value then exceeds the array bounds
and leads to memory overflow.

Call trace observed during a guard test:

Faulting instruction address: 0xc0149f1c
cpu 0x69: Vector: 380 (Data Access Out of Range) at [c03fea303420]
pc:c0149f1c: prefetch_freepointer+0x14/0x30
lr:c014e0f8: __kmalloc+0x1a8/0x1ac
sp:c03fea3036a0
   msr:90009033
   dar:c9c54b2c91dbf6b7
  current = 0xc03fea2c
  paca  = 0xcfddd880   softe: 3  irq_happened: 0x01
pid   = 1, comm = swapper/104
Linux version 4.16.7-openpower1 (smc@smc-desktop) (gcc version 6.4.0
(Buildroot 2018.02.1-6-ga8d1126)) #2 SMP Fri May 4 16:44:54 PDT 2018
enter ? for help
call trace:
 __kmalloc+0x1a8/0x1ac
 (unreliable)
 init_imc_pmu+0x7f4/0xbf0
 opal_imc_counters_probe+0x3fc/0x43c
 platform_drv_probe+0x48/0x80
 driver_probe_device+0x22c/0x308
 __driver_attach+0xa0/0xd8
 bus_for_each_dev+0x88/0xb4
 driver_attach+0x2c/0x40
 bus_add_driver+0x1e8/0x228
 driver_register+0xd0/0x114
 __platform_driver_register+0x50/0x64
 opal_imc_driver_init+0x24/0x38
 do_one_initcall+0x150/0x15c
 kernel_init_freeable+0x250/0x254
 kernel_init+0x1c/0x150
 ret_from_kernel_thread+0x5c/0xc8

Allocating memory for core-imc based on cpu_possible_mask, which has
bit 'cpu' set iff cpu is populatable, will fix this issue.
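
To make the failure concrete, here is a minimal sketch of the overflow
with hypothetical numbers (not figures from the report): 128 possible
CPUs, 8 threads per core, and one whole core GUARDed.

	/* Hypothetical: CPUs 32-39 GUARDed -- possible but not present,
	 * so num_present_cpus() returns 120 while core ids run 0..15.
	 */
	int nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); /* 15 */
	struct imc_mem_info *info = kcalloc(nr_cores, sizeof(*info), GFP_KERNEL);

	/* CPU 127 is online and beyond the present count due to the hole: */
	int core_id = 127 / threads_per_core;	/* 15: one past the last slot */
	info[core_id].vbase = NULL;		/* out-of-bounds access */

Sizing by num_possible_cpus() makes nr_cores 16, so every reachable
core_id has a slot.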

Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com>
Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsinghar...@gmail.com>
---
 arch/powerpc/perf/imc-pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d7532e7..75fb23c 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void)
 
 static void cleanup_all_core_imc_memory(void)
 {
-   int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+   int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
struct imc_mem_info *ptr = core_imc_pmu->mem_info;
int size = core_imc_pmu->counter_mem_size;
 
@@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
if (!pmu_ptr->pmu.name)
return -ENOMEM;
 
-   nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+   nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
 	pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
 				    GFP_KERNEL);
 
-- 
2.7.4



Re: [PATCH] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()

2018-05-14 Thread Anju T Sudhakar



On Friday 11 May 2018 07:13 PM, Anju T Sudhakar wrote:

Currently memory is allocated for core-imc based on cpu_present_mask, which has
bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core)
as the array index to access the memory.
So in a system with guarded cores, since allocation happens based on
cpu_present_mask, the (cpu number / threads per core) value can exceed the
array bounds and lead to memory overflow.

The issue is exposed in a guard test.
The guard test makes some CPUs unavailable to the system during boot
time as well as at runtime. So when a cpu is unavailable to the system during
boot time, the memory allocation happens depending on the number of available
cpus. And when we access the memory using (cpu number / threads per core) as
the index, the system crashes due to memory overflow.

Allocating memory for core-imc based on cpu_possible_mask, which has
bit 'cpu' set iff cpu is populatable, will fix this issue.

Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com>
Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>


Cc: <sta...@vger.kernel.org> # v4.14 +


---
  arch/powerpc/perf/imc-pmu.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d7532e7..75fb23c 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void)

  static void cleanup_all_core_imc_memory(void)
  {
-   int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+   int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
struct imc_mem_info *ptr = core_imc_pmu->mem_info;
int size = core_imc_pmu->counter_mem_size;

@@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
if (!pmu_ptr->pmu.name)
return -ENOMEM;

-   nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+   nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
 	pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
 				    GFP_KERNEL);





Re: [PATCH] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()

2018-05-14 Thread Anju T Sudhakar

Hi,


On Saturday 12 May 2018 06:05 AM, Balbir Singh wrote:

On Fri, May 11, 2018 at 11:43 PM, Anju T Sudhakar
<a...@linux.vnet.ibm.com> wrote:

Currently memory is allocated for core-imc based on cpu_present_mask, which has
bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core)
as the array index to access the memory.
So in a system with guarded cores, since allocation happens based on
cpu_present_mask, the (cpu number / threads per core) value can exceed the
array bounds and lead to memory overflow.

The issue is exposed in a guard test.
The guard test makes some CPUs unavailable to the system during boot
time as well as at runtime. So when a cpu is unavailable to the system during
boot time, the memory allocation happens depending on the number of available
cpus. And when we access the memory using (cpu number / threads per core) as
the index, the system crashes due to memory overflow.

Allocating memory for core-imc based on cpu_possible_mask, which has
bit 'cpu' set iff cpu is populatable, will fix this issue.

Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com>
Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
  arch/powerpc/perf/imc-pmu.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

The changelog does not clearly call out the confusion between present
and possible.
Guarded CPUs are possible but not present, so it blows a hole when we assume
the max length of our allocation is driven by our max present cpus, whereas
one of the cpus might be online and beyond the max present cpus, due to the
hole.

Reviewed-by: Balbir Singh <bsinghar...@gmail.com>

Balbir Singh.



Thanks for the review.
OK. I will update the commit message here.



Regards,
Anju



[PATCH] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()

2018-05-11 Thread Anju T Sudhakar
Currently memory is allocated for core-imc based on cpu_present_mask, which has
bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core)
as the array index to access the memory.
So in a system with guarded cores, since allocation happens based on
cpu_present_mask, the (cpu number / threads per core) value can exceed the
array bounds and lead to memory overflow.

The issue is exposed in a guard test.
The guard test makes some CPUs unavailable to the system during boot
time as well as at runtime. So when a cpu is unavailable to the system during
boot time, the memory allocation happens depending on the number of available
cpus. And when we access the memory using (cpu number / threads per core) as
the index, the system crashes due to memory overflow.

Allocating memory for core-imc based on cpu_possible_mask, which has
bit 'cpu' set iff cpu is populatable, will fix this issue.

Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com>
Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d7532e7..75fb23c 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void)
 
 static void cleanup_all_core_imc_memory(void)
 {
-   int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+   int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
struct imc_mem_info *ptr = core_imc_pmu->mem_info;
int size = core_imc_pmu->counter_mem_size;
 
@@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
if (!pmu_ptr->pmu.name)
return -ENOMEM;
 
-   nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
+   nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
 	pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
 				    GFP_KERNEL);
 
-- 
2.7.4



[PATCH 3/4] powerpc/perf: Return appropriate value for unknown domain

2018-04-09 Thread Anju T Sudhakar
Return proper error code for unknown domain during IMC initialization.
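
For context (commentary, not part of the patch): callers follow the
kernel's negative-errno convention, and since EPERM == 1, a bare
"return -1" reads back as -EPERM. A hypothetical caller shows why
-EINVAL is the better report:

	ret = init_imc_pmu(parent, pmu_ptr, pmu_idx);
	if (ret == -EINVAL)
		pr_err("IMC: unknown PMU domain %d\n", pmu_ptr->domain);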

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 1b285cd..4b4ca83 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1371,7 +1371,7 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
 
break;
default:
-   return  -1; /* Unknown domain */
+   return  -EINVAL;/* Unknown domain */
}
 
ret = update_events_in_group(parent, pmu_ptr);
-- 
2.7.4



[PATCH 4/4] powerpc/perf: Unregister thread-imc if core-imc not supported

2018-04-09 Thread Anju T Sudhakar
Enable thread-imc in the kernel only if core-imc is registered.

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/imc-pmu.h|  1 +
 arch/powerpc/perf/imc-pmu.c   | 12 
 arch/powerpc/platforms/powernv/opal-imc.c |  9 +
 3 files changed, 22 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index d76cb11..69f516e 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -128,4 +128,5 @@ extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
 extern void thread_imc_disable(void);
 extern int get_max_nest_dev(void);
+extern void unregister_thread_imc(void);
 #endif /* __ASM_POWERPC_IMC_PMU_H */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 4b4ca83..fa88785 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -40,6 +40,7 @@ static struct imc_pmu *core_imc_pmu;
 /* Thread IMC data structures and variables */
 
 static DEFINE_PER_CPU(u64 *, thread_imc_mem);
+static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
 struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
@@ -1228,6 +1229,16 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
}
 }
 
+/*
+ * Function to unregister thread-imc if core-imc
+ * is not registered.
+ */
+void unregister_thread_imc(void)
+{
+   imc_common_cpuhp_mem_free(thread_imc_pmu);
+   imc_common_mem_free(thread_imc_pmu);
+   perf_pmu_unregister(&thread_imc_pmu->pmu);
+}
 
 /*
  * imc_mem_init : Function to support memory allocation for core imc.
@@ -1296,6 +1307,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
}
}
 
+   thread_imc_pmu = pmu_ptr;
break;
default:
return -EINVAL;
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 490bb72..58a0794 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -255,6 +255,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
 {
struct device_node *imc_dev = pdev->dev.of_node;
int pmu_count = 0, domain;
+   bool core_imc_reg = false, thread_imc_reg = false;
u32 type;
 
/*
@@ -292,6 +293,10 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
if (!imc_pmu_create(imc_dev, pmu_count, domain)) {
if (domain == IMC_DOMAIN_NEST)
pmu_count++;
+   if (domain == IMC_DOMAIN_CORE)
+   core_imc_reg = true;
+   if (domain == IMC_DOMAIN_THREAD)
+   thread_imc_reg = true;
}
}
 
@@ -299,6 +304,10 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
if (pmu_count == 0)
debugfs_remove_recursive(imc_debugfs_parent);
 
+   /* If core imc is not registered, unregister thread-imc */
+   if (!core_imc_reg && thread_imc_reg)
+   unregister_thread_imc();
+
return 0;
 }
 
-- 
2.7.4



[PATCH 2/4] powerpc/perf: Replace the direct return with goto statement

2018-04-09 Thread Anju T Sudhakar
Replace the direct return statement in imc_mem_init() with goto,
to adhere to the kernel coding style.
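
For reference, the idiom being adopted, as a minimal self-contained
sketch (example_init is a made-up function, not from the patch):

	static int example_init(void)
	{
		int res = -ENOMEM;
		void *a, *b;

		a = kmalloc(16, GFP_KERNEL);
		if (!a)
			goto err;		/* nothing to undo yet */
		b = kmalloc(16, GFP_KERNEL);
		if (!b)
			goto err_free_a;	/* unwind in reverse order */
		return 0;

	err_free_a:
		kfree(a);
	err:
		return res;
	}

Each failure site jumps to the label that undoes exactly what has been
set up so far, keeping a single exit path per amount of cleanup.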

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 258b0f4..1b285cd 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1236,7 +1236,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
int pmu_index)
 {
const char *s;
-   int nr_cores, cpu, res;
+   int nr_cores, cpu, res = -ENOMEM;
 
	if (of_property_read_string(parent, "name", &s))
return -ENODEV;
@@ -1246,7 +1246,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
/* Update the pmu name */
 		pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s);
if (!pmu_ptr->pmu.name)
-   return -ENOMEM;
+   goto err;
 
/* Needed for hotplug/migration */
if (!per_nest_pmu_arr) {
@@ -1254,7 +1254,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
sizeof(struct imc_pmu *),
GFP_KERNEL);
if (!per_nest_pmu_arr)
-   return -ENOMEM;
+   goto err;
}
per_nest_pmu_arr[pmu_index] = pmu_ptr;
break;
@@ -1262,21 +1262,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
/* Update the pmu name */
pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
if (!pmu_ptr->pmu.name)
-   return -ENOMEM;
+   goto err;
 
nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core);
 		pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
 					    GFP_KERNEL);
 
if (!pmu_ptr->mem_info)
-   return -ENOMEM;
+   goto err;
 
core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
GFP_KERNEL);
 
if (!core_imc_refc) {
kfree(pmu_ptr->mem_info);
-   return -ENOMEM;
+   goto err;
}
 
core_imc_pmu = pmu_ptr;
@@ -1285,14 +1285,14 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
/* Update the pmu name */
pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
if (!pmu_ptr->pmu.name)
-   return -ENOMEM;
+   goto err;
 
thread_imc_mem_size = pmu_ptr->counter_mem_size;
for_each_online_cpu(cpu) {
res = thread_imc_mem_alloc(cpu, 
pmu_ptr->counter_mem_size);
if (res) {
cleanup_all_thread_imc_memory();
-   return res;
+   goto err;
}
}
 
@@ -1302,6 +1302,8 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
}
 
return 0;
+err:
+   return res;
 }
 
 /*
-- 
2.7.4



[PATCH 1/4] powerpc/perf: Rearrange memory freeing in imc init

2018-04-09 Thread Anju T Sudhakar
When any of the IMC (In-Memory Collection counter) devices fail
to initialize, imc_common_mem_free() frees a set of memory. In doing so,
the pmu_ptr pointer is also freed. But the pmu_ptr pointer is still used in a
subsequent function (imc_common_cpuhp_mem_free()), which is wrong. This patch
reorders the code to avoid such access.

Also free the memory which is dynamically allocated during imc initialization,
wherever required.
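
In miniature, the ordering rule the reorder enforces (label names as in
the diff below; the comment bodies are paraphrased):

	err_free_cpuhp_mem:
		/* still dereferences pmu_ptr, so it must run first */
		imc_common_cpuhp_mem_free(pmu_ptr);
	err_free_mem:
		/* now frees only the dynamic attribute groups; the
		 * kfree(pmu_ptr) it used to do is removed by this patch */
		imc_common_mem_free(pmu_ptr);
		return ret;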

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
test matrix and static checker run details are updated in the cover letter
patch is based on 
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git (branch: 
merge)

 arch/powerpc/perf/imc-pmu.c   | 32 ---
 arch/powerpc/platforms/powernv/opal-imc.c | 13 ++---
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d7532e7..258b0f4 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1153,7 +1153,7 @@ static void cleanup_all_core_imc_memory(void)
/* mem_info will never be NULL */
for (i = 0; i < nr_cores; i++) {
if (ptr[i].vbase)
-   free_pages((u64)ptr->vbase, get_order(size));
+   free_pages((u64)ptr[i].vbase, get_order(size));
}
 
kfree(ptr);
@@ -1191,7 +1191,6 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
if (pmu_ptr->attr_groups[IMC_EVENT_ATTR])
kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
-   kfree(pmu_ptr);
 }
 
 /*
@@ -1208,6 +1207,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)

cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
kfree(nest_imc_refc);
kfree(per_nest_pmu_arr);
+   per_nest_pmu_arr = NULL;
}
 
if (nest_pmus > 0)
@@ -1319,10 +1319,8 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
int ret;
 
ret = imc_mem_init(pmu_ptr, parent, pmu_idx);
-   if (ret) {
-   imc_common_mem_free(pmu_ptr);
-   return ret;
-   }
+   if (ret)
+   goto err_free_mem;
 
switch (pmu_ptr->domain) {
case IMC_DOMAIN_NEST:
@@ -1337,7 +1335,9 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
ret = init_nest_pmu_ref();
if (ret) {
	mutex_unlock(&nest_init_lock);
-   goto err_free;
+   kfree(per_nest_pmu_arr);
+   per_nest_pmu_arr = NULL;
+   goto err_free_mem;
}
/* Register for cpu hotplug notification. */
ret = nest_pmu_cpumask_init();
@@ -1345,7 +1345,8 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
	mutex_unlock(&nest_init_lock);
kfree(nest_imc_refc);
kfree(per_nest_pmu_arr);
-   goto err_free;
+   per_nest_pmu_arr = NULL;
+   goto err_free_mem;
}
}
nest_pmus++;
@@ -1355,7 +1356,7 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
ret = core_imc_pmu_cpumask_init();
if (ret) {
cleanup_all_core_imc_memory();
-   return ret;
+   goto err_free_mem;
}
 
break;
@@ -1363,7 +1364,7 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
ret = thread_imc_cpu_init();
if (ret) {
cleanup_all_thread_imc_memory();
-   return ret;
+   goto err_free_mem;
}
 
break;
@@ -1373,23 +1374,24 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
 
ret = update_events_in_group(parent, pmu_ptr);
if (ret)
-   goto err_free;
+   goto err_free_cpuhp_mem;
 
ret = update_pmu_ops(pmu_ptr);
if (ret)
-   goto err_free;
+   goto err_free_cpuhp_mem;
 
	ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1);
if (ret)
-   goto err_free;
+   goto err_free_cpuhp_mem;
 
pr_info("%s performance monitor hardware support registered\n",
pmu_p

[PATCH 0/4] powerpc/perf: IMC Cleanups

2018-04-09 Thread Anju T Sudhakar
This patch series includes some cleanups and the unregistration of the
thread-imc pmu, if the kernel does not have core-imc registered.

The entire patch set has been verified using the static checker smatch. 
Command used:   
$ make ARCH=powerpc CHECK="/smatch -p=kernel"  C=1 vmlinux | tee warns.txt

Tests Done: 

* Fail core-imc at init:
nest-imc - working  
cpuhotplug - works as expected  
thread-imc - not registered 

* Fail thread-imc at init:  
nest-imc - works
core-imc - works
cpuhotplug - works  

* Fail nest-imc at init 

core-imc - works
thread-imc -works   
cpuhotplug - works  

* Fail only one nest unit (say for mcs23)   

Other nest-units - works
core-imc - works
thread-imc - works  
cpuhotplug - works. 


* Kexec works   

The first three patches in this series address the comments by Dan Carpenter.


Anju T Sudhakar (4):
  powerpc/perf: Rearrange memory freeing in imc init
  powerpc/perf: Replace the direct return with goto statement
  powerpc/perf: Return appropriate value for unknown domain
  powerpc/perf: Unregister thread-imc if core-imc not supported

 arch/powerpc/include/asm/imc-pmu.h|  1 +
 arch/powerpc/perf/imc-pmu.c   | 64 +++
 arch/powerpc/platforms/powernv/opal-imc.c | 22 +--
 3 files changed, 60 insertions(+), 27 deletions(-)

-- 
2.7.4



Re: [PATCH v3] powerpc/kernel/sysfs: Export ldbar spr to sysfs

2018-03-12 Thread Anju T Sudhakar

Hi,


On Tuesday 06 March 2018 04:35 PM, Michael Ellerman wrote:

Anju T Sudhakar <a...@linux.vnet.ibm.com> writes:


diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 4437c70..caefb64 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -757,6 +759,9 @@ static int register_cpu_online(unsigned int cpu)
device_create_file(s, &pmc_attrs[i]);
  
  #ifdef CONFIG_PPC64

+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   device_create_file(s, &dev_attr_ldbar);

Is this register readable in supervisor state?


This is a nice catch, thanks. :)
The guest kernel cannot access the register; it is only readable in
hypervisor state.
I will resend the patch with a condition check so that this SPR will not
get registered for a guest kernel.
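
For illustration, a minimal sketch of such a check; gating on
CPU_FTR_HVMODE is my reading of the discussion above, not the literal
resend:

#ifdef CONFIG_PPC64
	/* LDBAR is readable only in hypervisor state, so do not
	 * expose it when the kernel runs as a guest. */
	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
	    cpu_has_feature(CPU_FTR_HVMODE))
		device_create_file(s, &dev_attr_ldbar);
#endif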



Regards,
Anju


cheers





Re: [bug report] powerpc/perf: Add nest IMC PMU support

2018-02-07 Thread Anju T Sudhakar

Hi Dan Carpenter,


On Wednesday 31 January 2018 08:55 PM, Dan Carpenter wrote:

Hello Anju T Sudhakar,

The patch 885dcd709ba9: "powerpc/perf: Add nest IMC PMU support" from
Jul 19, 2017, leads to the following static checker warning:

arch/powerpc/perf/imc-pmu.c:1393 init_imc_pmu()
warn: 'pmu_ptr' was already freed.

arch/powerpc/perf/imc-pmu.c
   1317  int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx)
   1318  {
   1319  int ret;
   1320
   1321  ret = imc_mem_init(pmu_ptr, parent, pmu_idx);
   1322  if (ret) {
   1323  imc_common_mem_free(pmu_ptr);
   1324  return ret;
   1325  }

Change this to:

if (ret)
goto err_free_pmu_ptr;

Or something instead of a direct return.  That's more normal kernel
style.

   1326
   1327  switch (pmu_ptr->domain) {
   1328  case IMC_DOMAIN_NEST:
   1329  /*
   1330  * Nest imc pmu need only one cpu per chip, we initialize the
   1331  * cpumask for the first nest imc pmu and use the same for the
   1332  * rest. To handle the cpuhotplug callback unregister, we track
   1333  * the number of nest pmus in "nest_pmus".
   1334  */
   1335  mutex_lock(&nest_init_lock);
   1336  if (nest_pmus == 0) {
   1337  ret = init_nest_pmu_ref();
   1338  if (ret) {
   1339  mutex_unlock(&nest_init_lock);
   1340  goto err_free;
   1341  }
   1342  /* Register for cpu hotplug notification. */
   1343  ret = nest_pmu_cpumask_init();
   1344  if (ret) {
   1345  mutex_unlock(&nest_init_lock);
   1346  kfree(nest_imc_refc);
   1347  kfree(per_nest_pmu_arr);
   1348  goto err_free;
   1349  }
   1350  }
   1351  nest_pmus++;
   1352  mutex_unlock(&nest_init_lock);
   1353  break;
   1354  case IMC_DOMAIN_CORE:
   1355  ret = core_imc_pmu_cpumask_init();
   1356  if (ret) {
   1357  cleanup_all_core_imc_memory();
   1358  return ret;

These direct returns don't look correct...

   1359  }
   1360
   1361  break;
   1362  case IMC_DOMAIN_THREAD:
   1363  ret = thread_imc_cpu_init();
   1364  if (ret) {
   1365  cleanup_all_thread_imc_memory();
   1366  return ret;
   1367  }
   1368
   1369  break;
   1370  default:
   1371  return  -1; /* Unknown domain */

This one certainly looks like a memory leak.  Plus -1 is -EPERM which is
probably not the correct error code.


   1372  }
   1373
   1374  ret = update_events_in_group(parent, pmu_ptr);
   1375  if (ret)
   1376  goto err_free;
   1377
   1378  ret = update_pmu_ops(pmu_ptr);
   1379  if (ret)
   1380  goto err_free;
   1381
   1382  ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1);
   1383  if (ret)
   1384  goto err_free;
   1385
   1386  pr_info("%s performance monitor hardware support registered\n",
   1387  
pmu_ptr->pmu.name);
   1388
   1389  return 0;
   1390
   1391  err_free:
   1392  imc_common_mem_free(pmu_ptr);
   1393  imc_common_cpuhp_mem_free(pmu_ptr);
   ^^^
This is a use after free, it should be in the reverse order.

err_free_cpuhp:
imc_common_cpuhp_mem_free(pmu_ptr);
err_free_pmu_ptr:
imc_common_mem_free(pmu_ptr);

   1394  return ret;
   1395  }

regards,
dan carpenter




Apologies for the delayed response, I just got back from vacation.

Thank you for pointing this out; I will rework the code and send a new version.


Regards,
Anju



[PATCH v2] platform/powernv: Add debugfs interface for imc-mode and imc-command

2017-12-12 Thread Anju T Sudhakar
The In-Memory Collection (IMC) counter pmu driver controls the ucode's
execution state. At system boot, the IMC perf driver pauses the ucode. The
ucode state is changed to "running" only when any of the nest units are
monitored or profiled using the perf tool.

Nest units support only a limited set of hardware counters, and the ucode is
always programmed in the "production" ("accumulation") mode. This mode is
configured to provide key performance metric data for most of the nest units.

But the ucode also supports other modes which can be used for "debug", to
drill down into specific nest units. That is, when switched to the "powerbus"
debug mode (for example), the ucode will dynamically reconfigure the nest
counters to target only "powerbus"-related events in the hardware counters.
This allows the IMC nest unit to focus on powerbus-related transactions in
the system in more detail. At that point, production-mode events may or may
not be counted.

IMC nest counters have both in-band (ucode) and out-of-band access. Since not
all nest counter configurations are supported by the ucode, out-of-band tools
are used to characterize the other nest counter configurations.

This patch provides an interface via "debugfs" to enable switching of ucode
modes in the system. To switch the ucode mode, one has to first pause the
microcode (imc_cmd), and then write the target mode value to the "imc_mode"
file.

Proposed Approach
=================

In the proposed approach, the function (export_imc_mode_and_cmd) which
creates the debugfs interface for imc mode and command is implemented in
opal-imc.c. Thus we can use imc_get_mem_addr() to get the homer base address
for each chip.

The interface to expose imc mode and command is required only if we have nest
pmu units registered. Employing the existing data structures to track whether
we have any nest units registered would require extending data from the perf
side into opal-imc.c. Instead, an integer counting successful nest-unit
registrations is introduced; the debugfs interface is removed based on that
count.

Example for the interface:  

root@:/sys/kernel/debug/imc# ls 
imc_cmd_0  imc_cmd_8  imc_mode_0  imc_mode_8   
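
For reference, a minimal sketch of how such a u64 debugfs knob is
typically wired up; knob_get, knob_set, knob_fops, and mode_addr are
illustrative names, not the patch's exact helpers:

	#include <linux/debugfs.h>
	#include <linux/fs.h>

	static int knob_get(void *data, u64 *val)
	{
		*val = *(u64 *)data;	/* read the mapped control word */
		return 0;
	}

	static int knob_set(void *data, u64 val)
	{
		*(u64 *)data = val;	/* write the new mode/command */
		return 0;
	}
	DEFINE_SIMPLE_ATTRIBUTE(knob_fops, knob_get, knob_set, "%llu\n");

	/* per chip, at probe time: */
	debugfs_create_file("imc_mode_0", 0600, imc_debugfs_parent,
			    mode_addr, &knob_fops);

Reading the file formats the current value; writing it stores a new one
through the same pointer.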

Changes from v1 -> v2
- The latest commit by Rajarshi Das adds the control block offset to the
  ima-catalog file. So if a cb_offset is specified in the ima-catalog, that
  is used to export the imc mode/command; otherwise IMC_CNTL_BLK_OFFSET is
  used.

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/imc-pmu.h|  7 +++
 arch/powerpc/platforms/powernv/opal-imc.c | 77 +++
 2 files changed, 84 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index fad0e6f..e760401 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -35,6 +35,13 @@
 #define THREAD_IMC_ENABLE   0x8000000000000000ULL
 
 /*
+ * For debugfs interface for imc-mode and imc-command
+ */
+#define IMC_CNTL_BLK_OFFSET0x3FC00
+#define IMC_CNTL_BLK_CMD_OFFSET8
+#define IMC_CNTL_BLK_MODE_OFFSET   32
+
+/*
  * Structure to hold memory address information for imc units.
  */
 struct imc_mem_info {
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 465ea10..dd4c9b8 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -21,6 +21,78 @@
 #include 
 #include 
 #include 
+#include <linux/debugfs.h>
+
+static struct dentry *imc_debugfs_parent;
+
+/* Helpers to export imc command and mode via debugfs */
+static int imc_mem_get(void *data, u64 *val)
+{

[PATCH 2/3] powerpc/perf: IMC code cleanup with some code refactoring

2017-12-10 Thread Anju T Sudhakar
Factor out memory freeing part for attribute elements from
imc_common_cpuhp_mem_free().

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 31 ---
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index ef7f9dd..71f425f 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1157,6 +1157,15 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+/* Function to free the attr_groups which are dynamically allocated */
+static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
+{
+   if (pmu_ptr->attr_groups[IMC_EVENT_ATTR])
+   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
+   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
+   kfree(pmu_ptr);
+}
+
 /*
  * Common function to unregister cpu hotplug callback and
  * free the memory.
@@ -1189,13 +1198,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
-
-   /* Only free the attr_groups which are dynamically allocated  */
-   if (pmu_ptr->attr_groups[IMC_EVENT_ATTR])
-   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
-   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
-   kfree(pmu_ptr);
-   return;
 }
 
 
@@ -1244,8 +1246,10 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
GFP_KERNEL);
 
-   if (!core_imc_refc)
+   if (!core_imc_refc) {
+   kfree(pmu_ptr->mem_info);
return -ENOMEM;
+   }
 
core_imc_pmu = pmu_ptr;
break;
@@ -1258,8 +1262,10 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
thread_imc_mem_size = pmu_ptr->counter_mem_size;
for_each_online_cpu(cpu) {
res = thread_imc_mem_alloc(cpu, 
pmu_ptr->counter_mem_size);
-   if (res)
+   if (res) {
+   cleanup_all_thread_imc_memory();
return res;
+   }
}
 
break;
@@ -1285,8 +1291,10 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
int ret;
 
ret = imc_mem_init(pmu_ptr, parent, pmu_idx);
-   if (ret)
-   goto err_free;
+   if (ret) {
+   imc_common_mem_free(pmu_ptr);
+   return ret;
+   }
 
switch (pmu_ptr->domain) {
case IMC_DOMAIN_NEST:
@@ -1353,6 +1361,7 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
return 0;
 
 err_free:
+   imc_common_mem_free(pmu_ptr);
imc_common_cpuhp_mem_free(pmu_ptr);
return ret;
 }
-- 
2.7.4



[PATCH 3/3] powerpc/perf: Pass struct imc_events as a parameter to imc_parse_event()

2017-12-10 Thread Anju T Sudhakar
Remove the allocation of struct imc_events from imc_parse_event(). Instead,
pass a struct imc_events pointer to imc_parse_event(): a pointer to a slot in
the array allocated in update_events_in_group().
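
The shape of the change, in miniature (ct_max stands in for the real
event count): the caller owns the array and the parser fills one slot in
place, so a failed parse no longer leaks a per-event allocation:

	/* caller (update_events_in_group) allocates the array once */
	pmu->events = kcalloc(ct_max, sizeof(struct imc_events), GFP_KERNEL);

	ct = 0;
	for_each_child_of_node(pmu_events, np) {
		ret = imc_parse_event(np, g_scale, g_unit, prefix, base_reg,
				      &pmu->events[ct]);
		if (!ret)
			ct++;	/* a failed slot is simply reused next time */
	}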

Reported-by: Dan Carpenter ("powerpc/perf: Fix a sizeof() typo so we allocate less memory")
Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/imc-pmu.h |  2 +-
 arch/powerpc/perf/imc-pmu.c| 66 +++---
 2 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index fad0e6f..080731d 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -71,7 +71,7 @@ struct imc_events {
 struct imc_pmu {
struct pmu pmu;
struct imc_mem_info *mem_info;
-   struct imc_events **events;
+   struct imc_events *events;
/*
 * Attribute groups for the PMU. Slot 0 used for
 * format attribute, slot 1 used for cpusmask attribute,
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 71f425f..5cb1f31 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -116,17 +116,13 @@ static struct attribute *device_str_attr_create(const char *name, const char *str)
	return &attr->attr.attr;
 }
 
-struct imc_events *imc_parse_event(struct device_node *np, const char *scale,
-				  const char *unit, const char *prefix, u32 base)
+static int imc_parse_event(struct device_node *np, const char *scale,
+ const char *unit, const char *prefix,
+ u32 base, struct imc_events *event)
 {
-   struct imc_events *event;
const char *s;
u32 reg;
 
-   event = kzalloc(sizeof(struct imc_events), GFP_KERNEL);
-   if (!event)
-   return NULL;
-
if (of_property_read_u32(np, "reg", ))
goto error;
/* Add the base_reg value to the "reg" */
@@ -157,14 +153,32 @@ struct imc_events *imc_parse_event(struct device_node 
*np, const char *scale,
goto error;
}
 
-   return event;
+   return 0;
 error:
kfree(event->unit);
kfree(event->scale);
kfree(event->name);
-   kfree(event);
+   return -EINVAL;
+}
+
+/*
+ * imc_free_events: Function to cleanup the events list, having
+ * "nr_entries".
+ */
+static void imc_free_events(struct imc_events *events, int nr_entries)
+{
+   int i;
+
+   /* Nothing to clean, return */
+   if (!events)
+   return;
+   for (i = 0; i < nr_entries; i++) {
+   kfree(events[i].unit);
+   kfree(events[i].scale);
+   kfree(events[i].name);
+   }
 
-   return NULL;
+   kfree(events);
 }
 
 /*
@@ -176,9 +190,8 @@ static int update_events_in_group(struct device_node *node, 
struct imc_pmu *pmu)
struct attribute_group *attr_group;
struct attribute **attrs, *dev_str;
struct device_node *np, *pmu_events;
-   struct imc_events *ev;
u32 handle, base_reg;
-   int i=0, j=0, ct;
+   int i = 0, j = 0, ct, ret;
const char *prefix, *g_scale, *g_unit;
const char *ev_val_str, *ev_scale_str, *ev_unit_str;
 
@@ -216,15 +229,17 @@ static int update_events_in_group(struct device_node 
*node, struct imc_pmu *pmu)
ct = 0;
/* Parse the events and update the struct */
for_each_child_of_node(pmu_events, np) {
-   ev = imc_parse_event(np, g_scale, g_unit, prefix, base_reg);
-   if (ev)
-   pmu->events[ct++] = ev;
+		ret = imc_parse_event(np, g_scale, g_unit, prefix, base_reg,
+				      &pmu->events[ct]);
+   if (!ret)
+   ct++;
}
 
/* Allocate memory for attribute group */
attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL);
-   if (!attr_group)
+   if (!attr_group) {
+   imc_free_events(pmu->events, ct);
return -ENOMEM;
+   }
 
/*
 * Allocate memory for attributes.
@@ -237,31 +252,31 @@ static int update_events_in_group(struct device_node 
*node, struct imc_pmu *pmu)
attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL);
if (!attrs) {
kfree(attr_group);
-   kfree(pmu->events);
+   imc_free_events(pmu->events, ct);
return -ENOMEM;
}
 
attr_group->name = "events";
attr_group->attrs = attrs;
do {
-		ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i]->value);
-		dev_str = device_str_attr_create(pmu->events[i]->name, ev_val_str);

[PATCH 1/3] powerpc/perf: Remove thread_imc_pmu global variable from imc code

2017-12-10 Thread Anju T Sudhakar
Remove the global variable 'thread_imc_pmu', since it is not used in the code.

Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com>
---
 arch/powerpc/perf/imc-pmu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 4eb9e2b..ef7f9dd 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -40,7 +40,6 @@ static struct imc_pmu *core_imc_pmu;
 /* Thread IMC data structures and variables */
 
 static DEFINE_PER_CPU(u64 *, thread_imc_mem);
-static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
 struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
@@ -1263,7 +1262,6 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
return res;
}
 
-   thread_imc_pmu = pmu_ptr;
break;
default:
return -EINVAL;
-- 
2.7.4



[PATCH 0/3] IMC code clean up and refactoring

2017-12-10 Thread Anju T Sudhakar
The first patch removes an unused variable in the code for
IMC (In-Memory Collection counters).

The second patch does some code refactoring.

The third patch in the series makes struct imc_events a parameter to the
function imc_parse_event().

Anju T Sudhakar (3):
  powerpc/perf: Remove thread_imc_pmu global variable from imc code
  powerpc/perf: IMC code cleanup with some code refactoring
  powerpc/perf: Pass struct imc_events as a parameter to imc_parse_event()

 arch/powerpc/include/asm/imc-pmu.h |  2 +-
 arch/powerpc/perf/imc-pmu.c| 99 +++---
 2 files changed, 61 insertions(+), 40 deletions(-)

-- 
2.7.4


