Re: [PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc

2021-03-17 Thread Athira Rajeev
On 16-Mar-2021, at 4:48 AM, Jiri Olsa  wrote:

> On Mon, Mar 15, 2021 at 01:22:09PM +0530, Athira Rajeev wrote:
>
> SNIP
>
>> +
>> +static char *setup_dynamic_sort_keys(char *str)
>> +{
>> +	unsigned int j;
>> +
>> +	if (sort__mode == SORT_MODE__MEMORY)
>> +		for (j = 0; j < ARRAY_SIZE(dynamic_sort_keys_mem); j++)
>> +			if (arch_support_dynamic_key(dynamic_sort_keys_mem[j])) {
>> +				str = suffix_if_not_in(dynamic_sort_keys_mem[j], str);
>> +				if (str == NULL)
>> +					return str;
>> +			}
>> +
>> +	return str;
>> +}
>> +
>>  static int __setup_sorting(struct evlist *evlist)
>>  {
>>  	char *str;
>> @@ -3050,6 +3085,12 @@ static int __setup_sorting(struct evlist *evlist)
>>  		}
>>  	}
>>  
>> +	str = setup_dynamic_sort_keys(str);
>> +	if (str == NULL) {
>> +		pr_err("Not enough memory to setup dynamic sort keys");
>> +		return -ENOMEM;
>> +	}
>
> hum, so this is basically overloading the default_mem_sort_order for
> architecture, right?
>
> then I think it'd be easier just overload default_mem_sort_order directly
>
> I was thinking more about adding extra (arch specific) loop to
> sort_dimension__add or somehow add arch's specific stuff to
> memory_sort_dimensions

Hi Jiri,

Above patch was to append additional sort keys to sort order based on
sort mode and architecture support. I had initially thought of defining two
orders ( say default_mem_sort_order plus mem_sort_order_pstage ). But if
new sort keys gets added for mem mode in future, we will need to keep
updating both orders. So preferred the approach of "appending" supported sort
keys to default order.

Following your thought on using "sort_dimension__add", I tried below approach
which is easier. The new sort dimension "p_stage_cyc" is presently only supported
on powerpc. For unsupported platforms, we don't want to display it
in the perf report output columns. Hence added check in sort_dimension__add()
and skip the sort key in case it's not applicable for particular arch.

Please help to check if below approach looks fine.

diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
index b80fbee83b6e..7205767d75eb 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -44,3 +44,10 @@ const char *arch_perf_header_entry__add(const char *se_header)
 		return "Dispatch Cyc";
 	return se_header;
 }
+
+int arch_support_sort_key(const char *sort_key)
+{
+	if (!strcmp(sort_key, "p_stage_cyc"))
+		return 1;
+	return 0;
+}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 65f89e80916f..612a92aaaefb 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -429,5 +429,6 @@ char *get_page_size_name(u64 size, char *str);
 void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
 const char *arch_perf_header_entry__add(const char *se_header);
+int arch_support_sort_key(const char *sort_key);
 
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index cbb3899e7eca..d8b0b0b43a81 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -47,6 +47,7 @@ regex_t		ignore_callees_regex;
 int		have_ignore_callees = 0;
 enum sort_mode	sort__mode = SORT_MODE__NORMAL;
 const char	*dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"};
+const char	*arch_specific_sort_keys[] = {"p_stage_cyc"};
 
 /*
  * Replaces all occurrences of a char used with the:
@@ -1837,6 +1838,11 @@ struct sort_dimension {
 	int			taken;
 };
 
+int __weak arch_support_sort_key(const char *sort_key __maybe_unused)
+{
+	return 0;
+}
+
 const char * __weak arch_perf_header_entry__add(const char *se_header)
 {
 	return se_header;
@@ -2773,6 +2779,18 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 {
 	unsigned int i, j;
 
+	/* Check to see if there are any arch specific
+	 * sort dimensions not applicable for the current
+	 * architecture. If so, Skip that sort key since
+	 * we don't want to display it in the output fields.
+	 */
+	for (j = 0; j < ARRAY_SIZE(arch_specific_sort_keys); j++) {
+		if (!strcmp(arch_specific_sort_keys[j], tok) &&
+				!arch_support_sort_key(tok)) {
+			return 0;
+		}
+	}
+
 	for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
 		struct sort_dimension *sd = &common_sort_dimensions[i];
 
-- 
2.26.2

Thanks
Athira

> jirka

Re: [PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc

2021-03-17 Thread Jiri Olsa
On Wed, Mar 17, 2021 at 05:01:27PM +0530, Athira Rajeev wrote:
> On 16-Mar-2021, at 4:48 AM, Jiri Olsa jo...@redhat.com wrote:
>
>> On Mon, Mar 15, 2021 at 01:22:09PM +0530, Athira Rajeev wrote:
>>
>> SNIP
>>
>>> +
>>> +static char *setup_dynamic_sort_keys(char *str)
>>> +{
>>> +	unsigned int j;
>>> +
>>> +	if (sort__mode == SORT_MODE__MEMORY)
>>> +		for (j = 0; j < ARRAY_SIZE(dynamic_sort_keys_mem); j++)
>>> +			if (arch_support_dynamic_key(dynamic_sort_keys_mem[j])) {
>>> +				str = suffix_if_not_in(dynamic_sort_keys_mem[j], str);
>>> +				if (str == NULL)
>>> +					return str;
>>> +			}
>>> +
>>> +	return str;
>>> +}
>>> +
>>>  static int __setup_sorting(struct evlist *evlist)
>>>  {
>>>  	char *str;
>>> @@ -3050,6 +3085,12 @@ static int __setup_sorting(struct evlist *evlist)
>>>  		}
>>>  	}
>>>  
>>> +	str = setup_dynamic_sort_keys(str);
>>> +	if (str == NULL) {
>>> +		pr_err("Not enough memory to setup dynamic sort keys");
>>> +		return -ENOMEM;
>>> +	}
>>
>> hum, so this is basicaly overloading the default_mem_sort_order for
>> architecture, right?
>>
>> then I think it'd be easier just overload default_mem_sort_order directly
>>
>> I was thinking more about adding extra (arch specific) loop to
>> sort_dimension__add or somehow add arch's specific stuff to
>> memory_sort_dimensions
>
> Hi Jiri,
>
> Above patch was to append additional sort keys to sort order based on
> sort mode and architecture support. I had initially thought of defining two
> orders ( say default_mem_sort_order plus mem_sort_order_pstage ). But if
> new sort keys gets added for mem mode in future, we will need to keep
> updating both orders. So preferred the approach of "appending" supported sort
> keys to default order.
>
> Following your thought on using "sort_dimension__add", I tried below approach
> which is easier. The new sort dimension "p_stage_cyc" is presently only supported
> on powerpc. For unsupported platforms, we don't want to display it
> in the perf report output columns. Hence added check in sort_dimension__add()
> and skip the sort key incase its not applicable for particular arch.
>
> Please help to check if below approach looks fine.
>
> diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
> index b80fbee83b6e..7205767d75eb 100644
> --- a/tools/perf/arch/powerpc/util/event.c
> +++ b/tools/perf/arch/powerpc/util/event.c
> @@ -44,3 +44,10 @@ const char *arch_perf_header_entry__add(const char *se_header)
>  		return "Dispatch Cyc";
>  	return se_header;
>  }
> +
> +int arch_support_sort_key(const char *sort_key)
> +{
> +	if (!strcmp(sort_key, "p_stage_cyc"))
> +		return 1;
> +	return 0;
> +}
> diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
> index 65f89e80916f..612a92aaaefb 100644
> --- a/tools/perf/util/event.h
> +++ b/tools/perf/util/event.h
> @@ -429,5 +429,6 @@ char *get_page_size_name(u64 size, char *str);
>  void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
>  void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
>  const char *arch_perf_header_entry__add(const char *se_header);
> +int arch_support_sort_key(const char *sort_key);
>  
>  #endif /* __PERF_RECORD_H */
> diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> index cbb3899e7eca..d8b0b0b43a81 100644
> --- a/tools/perf/util/sort.c
> +++ b/tools/perf/util/sort.c
> @@ -47,6 +47,7 @@ regex_t		ignore_callees_regex;
>  int		have_ignore_callees = 0;
>  enum sort_mode	sort__mode = SORT_MODE__NORMAL;
>  const char	*dynamic_headers[] =

Re: [PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc

2021-03-15 Thread Jiri Olsa
On Mon, Mar 15, 2021 at 01:22:09PM +0530, Athira Rajeev wrote:

SNIP

> +
> +static char *setup_dynamic_sort_keys(char *str)
> +{
> + unsigned int j;
> +
> + if (sort__mode == SORT_MODE__MEMORY)
> + for (j = 0; j < ARRAY_SIZE(dynamic_sort_keys_mem); j++)
> + if (arch_support_dynamic_key(dynamic_sort_keys_mem[j])) 
> {
> + str = 
> suffix_if_not_in(dynamic_sort_keys_mem[j], str);
> + if (str == NULL)
> + return str;
> + }
> +
> + return str;
> +}
> +
>  static int __setup_sorting(struct evlist *evlist)
>  {
>   char *str;
> @@ -3050,6 +3085,12 @@ static int __setup_sorting(struct evlist *evlist)
>   }
>   }
>  
> + str = setup_dynamic_sort_keys(str);
> + if (str == NULL) {
> + pr_err("Not enough memory to setup dynamic sort keys");
> + return -ENOMEM;
> + }

hum, so this is basically overloading the default_mem_sort_order for
architecture, right?

then I think it'd be easier just overload default_mem_sort_order directly

I was thinking more about adding extra (arch specific) loop to
sort_dimension__add or somehow add arch's specific stuff to
memory_sort_dimensions

jirka



Re: [PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc

2021-03-15 Thread Athira Rajeev



> On 12-Mar-2021, at 6:26 PM, Jiri Olsa  wrote:
> 
> On Tue, Mar 09, 2021 at 09:04:00AM -0500, Athira Rajeev wrote:
>> The pipeline stage cycles details can be recorded on powerpc from
>> the contents of Performance Monitor Unit (PMU) registers. On
>> ISA v3.1 platform, sampling registers exposes the cycles spent in
>> different pipeline stages. Patch adds perf tools support to present
>> two of the cycle counter information along with memory latency (weight).
>> 
>> Re-use the field 'ins_lat' for storing the first pipeline stage cycle.
>> This is stored in 'var2_w' field of 'perf_sample_weight'.
>> 
>> Add a new field 'p_stage_cyc' to store the second pipeline stage cycle
>> which is stored in 'var3_w' field of perf_sample_weight.
>> 
>> Add new sort function 'Pipeline Stage Cycle' and include this in
>> default_mem_sort_order[]. This new sort function may be used to denote
>> some other pipeline stage in another architecture. So add this to
>> list of sort entries that can have dynamic header string.
>> 
>> Signed-off-by: Athira Rajeev 
>> ---
>> tools/perf/Documentation/perf-report.txt |  1 +
>> tools/perf/arch/powerpc/util/event.c | 18 --
>> tools/perf/util/event.h  |  1 +
>> tools/perf/util/hist.c   | 11 ---
>> tools/perf/util/hist.h   |  1 +
>> tools/perf/util/session.c|  4 +++-
>> tools/perf/util/sort.c   | 24 ++--
>> tools/perf/util/sort.h   |  2 ++
>> 8 files changed, 54 insertions(+), 8 deletions(-)
>> 
>> diff --git a/tools/perf/Documentation/perf-report.txt 
>> b/tools/perf/Documentation/perf-report.txt
>> index f546b5e9db05..9691d9c227ba 100644
>> --- a/tools/perf/Documentation/perf-report.txt
>> +++ b/tools/perf/Documentation/perf-report.txt
>> @@ -112,6 +112,7 @@ OPTIONS
>>  - ins_lat: Instruction latency in core cycles. This is the global 
>> instruction
>>latency
>>  - local_ins_lat: Local instruction latency version
>> +- p_stage_cyc: Number of cycles spent in a pipeline stage.
> 
> please specify in here that it's ppc only

Ok Sure,

> 
> SNIP
> 
>> +struct sort_entry sort_p_stage_cyc = {
>> +.se_header  = "Pipeline Stage Cycle",
>> +.se_cmp = sort__global_p_stage_cyc_cmp,
>> +.se_snprintf= hist_entry__p_stage_cyc_snprintf,
>> +.se_width_idx   = HISTC_P_STAGE_CYC,
>> +};
>> +
>> struct sort_entry sort_mem_daddr_sym = {
>>  .se_header  = "Data Symbol",
>>  .se_cmp = sort__daddr_cmp,
>> @@ -1853,6 +1872,7 @@ static void sort_dimension_add_dynamic_header(struct 
>> sort_dimension *sd)
>>  DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
>>  DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
>>  DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
>> +DIM(SORT_P_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),
> 
> this might be out of scope for this patch, but would it make sense
> to add arch specific sort dimension? so the specific column is
> not even visible on arch that it's not supported on
> 

Hi Jiri,

Thanks for the suggestions.

Below is an approach I came up with for adding dynamic sort key based on 
architecture support.
With this patch, perf report for mem mode will display new sort key only in 
supported archs. 
Please help to review if this approach looks good. I have created this on top 
of my current set. If this looks fine, 
I can include this in version2 patch set.

From 8ebbe6ae802d895103335899e4e60dde5e562f33 Mon Sep 17 00:00:00 2001
From: Athira Rajeev 
Date: Mon, 15 Mar 2021 02:33:28 +
Subject: [PATCH] tools/perf: Add dynamic sort dimensions for mem mode

Add dynamic sort dimensions for mem mode.

Signed-off-by: Athira Rajeev 
---
 tools/perf/arch/powerpc/util/event.c |  7 +
 tools/perf/util/event.h  |  1 +
 tools/perf/util/sort.c   | 43 +++-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/tools/perf/arch/powerpc/util/event.c 
b/tools/perf/arch/powerpc/util/event.c
index b80fbee83b6e..fddfc288c415 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -44,3 +44,10 @@ const char *arch_perf_header_entry__add(const char 
*se_header)
return "Dispatch Cyc";
return se_header;
 }
+
+int arch_support_dynamic_key(const char *sort_key)
+{
+   if (!strcmp(sort_key, "p_stage_cyc"))
+   return 1;
+   return 0;
+}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 65f89e80916f..6cd4bf54dbdc 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -429,5 +429,6 @@ char *get_page_size_name(u64 size, char *str);
 void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 
*array, u64 type);
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 
*array, u64 type);
 const char 

Re: [PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc

2021-03-12 Thread Jiri Olsa
On Tue, Mar 09, 2021 at 09:04:00AM -0500, Athira Rajeev wrote:
> The pipeline stage cycles details can be recorded on powerpc from
> the contents of Performance Monitor Unit (PMU) registers. On
> ISA v3.1 platform, sampling registers exposes the cycles spent in
> different pipeline stages. Patch adds perf tools support to present
> two of the cycle counter information along with memory latency (weight).
> 
> Re-use the field 'ins_lat' for storing the first pipeline stage cycle.
> This is stored in 'var2_w' field of 'perf_sample_weight'.
> 
> Add a new field 'p_stage_cyc' to store the second pipeline stage cycle
> which is stored in 'var3_w' field of perf_sample_weight.
> 
> Add new sort function 'Pipeline Stage Cycle' and include this in
> default_mem_sort_order[]. This new sort function may be used to denote
> some other pipeline stage in another architecture. So add this to
> list of sort entries that can have dynamic header string.
> 
> Signed-off-by: Athira Rajeev 
> ---
>  tools/perf/Documentation/perf-report.txt |  1 +
>  tools/perf/arch/powerpc/util/event.c | 18 --
>  tools/perf/util/event.h  |  1 +
>  tools/perf/util/hist.c   | 11 ---
>  tools/perf/util/hist.h   |  1 +
>  tools/perf/util/session.c|  4 +++-
>  tools/perf/util/sort.c   | 24 ++--
>  tools/perf/util/sort.h   |  2 ++
>  8 files changed, 54 insertions(+), 8 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-report.txt 
> b/tools/perf/Documentation/perf-report.txt
> index f546b5e9db05..9691d9c227ba 100644
> --- a/tools/perf/Documentation/perf-report.txt
> +++ b/tools/perf/Documentation/perf-report.txt
> @@ -112,6 +112,7 @@ OPTIONS
>   - ins_lat: Instruction latency in core cycles. This is the global 
> instruction
> latency
>   - local_ins_lat: Local instruction latency version
> + - p_stage_cyc: Number of cycles spent in a pipeline stage.

please specify in here that it's ppc only

SNIP

> +struct sort_entry sort_p_stage_cyc = {
> + .se_header  = "Pipeline Stage Cycle",
> + .se_cmp = sort__global_p_stage_cyc_cmp,
> + .se_snprintf= hist_entry__p_stage_cyc_snprintf,
> + .se_width_idx   = HISTC_P_STAGE_CYC,
> +};
> +
>  struct sort_entry sort_mem_daddr_sym = {
>   .se_header  = "Data Symbol",
>   .se_cmp = sort__daddr_cmp,
> @@ -1853,6 +1872,7 @@ static void sort_dimension_add_dynamic_header(struct 
> sort_dimension *sd)
>   DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
>   DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
>   DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
> + DIM(SORT_P_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),

this might be out of scope for this patch, but would it make sense
to add arch specific sort dimension? so the specific column is
not even visible on arch that it's not supported on


>  };
>  
>  #undef DIM
> diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
> index 63f67a3f3630..23b20cbbc846 100644
> --- a/tools/perf/util/sort.h
> +++ b/tools/perf/util/sort.h
> @@ -51,6 +51,7 @@ struct he_stat {
>   u64 period_guest_us;
>   u64 weight;
>   u64 ins_lat;
> + u64 p_stage_cyc;
>   u32 nr_events;
>  };
>  
> @@ -234,6 +235,7 @@ enum sort_type {
>   SORT_CODE_PAGE_SIZE,
>   SORT_LOCAL_INS_LAT,
>   SORT_GLOBAL_INS_LAT,
> + SORT_P_STAGE_CYC,

we could have the whole 'SORT_PIPELINE_STAGE_CYC',
so it's more obvious

thanks,
jirka



[PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc

2021-03-09 Thread Athira Rajeev
The pipeline stage cycles details can be recorded on powerpc from
the contents of Performance Monitor Unit (PMU) registers. On
ISA v3.1 platforms, sampling registers expose the cycles spent in
different pipeline stages. This patch adds perf tools support to present
two of these cycle counters along with memory latency (weight).

Re-use the field 'ins_lat' for storing the first pipeline stage cycle.
This is stored in 'var2_w' field of 'perf_sample_weight'.

Add a new field 'p_stage_cyc' to store the second pipeline stage cycle
which is stored in 'var3_w' field of perf_sample_weight.

Add new sort function 'Pipeline Stage Cycle' and include this in
default_mem_sort_order[]. This new sort function may be used to denote
some other pipeline stage in another architecture. So add this to
list of sort entries that can have dynamic header string.

Signed-off-by: Athira Rajeev 
---
 tools/perf/Documentation/perf-report.txt |  1 +
 tools/perf/arch/powerpc/util/event.c | 18 --
 tools/perf/util/event.h  |  1 +
 tools/perf/util/hist.c   | 11 ---
 tools/perf/util/hist.h   |  1 +
 tools/perf/util/session.c|  4 +++-
 tools/perf/util/sort.c   | 24 ++--
 tools/perf/util/sort.h   |  2 ++
 8 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt 
b/tools/perf/Documentation/perf-report.txt
index f546b5e9db05..9691d9c227ba 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -112,6 +112,7 @@ OPTIONS
- ins_lat: Instruction latency in core cycles. This is the global 
instruction
  latency
- local_ins_lat: Local instruction latency version
+   - p_stage_cyc: Number of cycles spent in a pipeline stage.
 
By default, comm, dso and symbol keys are used.
(i.e. --sort comm,dso,symbol)
diff --git a/tools/perf/arch/powerpc/util/event.c 
b/tools/perf/arch/powerpc/util/event.c
index f49d32c2c8ae..b80fbee83b6e 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -18,8 +18,11 @@ void arch_perf_parse_sample_weight(struct perf_sample *data,
weight.full = *array;
if (type & PERF_SAMPLE_WEIGHT)
data->weight = weight.full;
-   else
+   else {
data->weight = weight.var1_dw;
+   data->ins_lat = weight.var2_w;
+   data->p_stage_cyc = weight.var3_w;
+   }
 }
 
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
@@ -27,6 +30,17 @@ void arch_perf_synthesize_sample_weight(const struct 
perf_sample *data,
 {
*array = data->weight;
 
-   if (type & PERF_SAMPLE_WEIGHT_STRUCT)
+   if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
*array &= 0xffffffff;
+   *array |= ((u64)data->ins_lat << 32);
+   }
+}
+
+const char *arch_perf_header_entry__add(const char *se_header)
+{
+   if (!strcmp(se_header, "Local INSTR Latency"))
+   return "Finish Cyc";
+   else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+   return "Dispatch Cyc";
+   return se_header;
 }
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 89b149e2e70a..65f89e80916f 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -147,6 +147,7 @@ struct perf_sample {
u8  cpumode;
u16 misc;
u16 ins_lat;
+   u16 p_stage_cyc;
bool no_hw_idx; /* No hw_idx collected in branch_stack */
char insn[MAX_INSN];
void *raw_data;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index c82f5fc26af8..9299ee535518 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,6 +211,7 @@ void hists__calc_col_len(struct hists *hists, struct 
hist_entry *h)
hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
+   hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13);
if (symbol_conf.nanosecs)
hists__new_col_len(hists, HISTC_TIME, 16);
else
@@ -289,13 +290,14 @@ static long hist_time(unsigned long htime)
 }
 
 static void he_stat__add_period(struct he_stat *he_stat, u64 period,
-   u64 weight, u64 ins_lat)
+   u64 weight, u64 ins_lat, u64 p_stage_cyc)
 {
 
he_stat->period += period;
he_stat->weight += weight;
he_stat->nr_events  += 1;
he_stat->ins_lat+= ins_lat;
+   he_stat->p_stage_cyc+= p_stage_cyc;
 }
 
 static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
@@ -308,6 +310,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct 
he_stat *src)
dest->nr_events += src->nr_events;