Re: [PATCH v2 0/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-19 Thread James Clark



On 16/04/2021 18:16, Arnaldo Carvalho de Melo wrote:
> Em Fri, Apr 16, 2021 at 09:07:09AM -0600, Mathieu Poirier escreveu:
>> Hi James,
>>
>> On Fri, Apr 16, 2021 at 01:56:30PM +0300, James Clark wrote:
>>> Changes since v1:
>>>  * Improved variable name from etm_timestamp -> cs_timestamp
>>>  * Fixed ordering of Signed-off-by
>>>
>>
>> You forgot to add the RB and AB you received.  Since Arnaldo is responsible 
>> for
>> the perf tools subsystem, please send another revision.
>  
> 
> 
> Yep, please collect Reported-by and Acked-by as you go sending new
> versions of a patchset, the last one I don't have a problem collecting
> myself, but if you have to resend, please collect the feedback tags.
> 
> - Arnaldo
> 

Ok thanks, I will keep that in mind for the future. I wasn't sure if they still
applied or not as it was a new version.

Thanks
James

>> Thanks,
>> Mathieu
>>
>>> James Clark (2):
>>>   perf cs-etm: Refactor timestamp variable names
>>>   perf cs-etm: Set time on synthesised samples to preserve ordering
>>>
>>>  .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 18 +++
>>>  tools/perf/util/cs-etm.c  | 52 ++-
>>>  tools/perf/util/cs-etm.h  |  4 +-
>>>  3 files changed, 39 insertions(+), 35 deletions(-)
>>>
>>> -- 
>>> 2.28.0
>>>
> 


Re: [PATCH v4 4/6] perf arm-spe: Assign kernel time to synthesized event

2021-04-16 Thread James Clark



On 15/04/2021 18:23, Leo Yan wrote:
> On Thu, Apr 15, 2021 at 05:46:31PM +0300, James Clark wrote:
>>
>>
>> On 12/04/2021 12:10, Leo Yan wrote:
>>> In current code, it assigns the arch timer counter to the synthesized
>>> samples Arm SPE trace, thus the samples don't contain the kernel time
>>> but only contain the raw counter value.
>>>
>>> To fix the issue, this patch converts the timer counter to kernel time
>>> and assigns it to sample timestamp.
>>>
>>> Signed-off-by: Leo Yan 
>>> ---
>>>  tools/perf/util/arm-spe.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
>>> index 23714cf0380e..c13a89f06ab8 100644
>>> --- a/tools/perf/util/arm-spe.c
>>> +++ b/tools/perf/util/arm-spe.c
>>> @@ -234,7 +234,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
>>> struct arm_spe_record *record = &speq->decoder->record;
>>>  
>>> if (!spe->timeless_decoding)
>>> -   sample->time = speq->timestamp;
>>> +   sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);
>>
>>
>> I noticed that in arm_spe_recording_options() the TIME sample bit is set 
>> regardless of any options.
>> I don't know of a way to remove this, and if there isn't, does that mean 
>> that all the code in this
>> file that looks at spe->timeless_decoding is untested and has never been hit?
>>
>> Unless there is a way to get a perf file with only the AUXTRACE event and no 
>> others? I think that one
>> might have no timestamp set. Otherwise other events will always have 
>> timestamps so spe->timeless_decoding
>> is always false.
> 
> Good point.  To be honest, I never noticed this issue until you
> mentioned this.
> 
> We should fix for the "timeless" flow; and it's questionable for the
> function arm_spe_recording_options(), except for setting
> PERF_SAMPLE_TIME, it also hard codes for setting
> PERF_SAMPLE_CPU and PERF_SAMPLE_TID.  Might need to carefully go
> through this function.
> 

Yeah, it's not strictly related to your change, which is definitely an 
improvement.
But maybe we should have a look at the SPE implementation relating to 
timestamps as a whole.

> Thanks,
> Leo
> 


[PATCH v2 1/2] perf cs-etm: Refactor timestamp variable names

2021-04-16 Thread James Clark
Remove ambiguity in variable names relating to timestamps.
A later commit will save the sample kernel timestamp in one
of the etm structs, so name all elements appropriately to
avoid confusion.

This also removes some ambiguity arising from the fact
that the --timestamp argument to perf record refers to
sample kernel timestamps, and the /timestamp/ event modifier
refers to CS timestamps, so the term is overloaded.

Signed-off-by: James Clark 
---
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 18 
 tools/perf/util/cs-etm.c  | 42 +--
 tools/perf/util/cs-etm.h  |  4 +-
 3 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 059bcec3f651..b01d363b9301 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -276,13 +276,13 @@ cs_etm_decoder__do_soft_timestamp(struct cs_etm_queue 
*etmq,
  const uint8_t trace_chan_id)
 {
/* No timestamp packet has been received, nothing to do */
-   if (!packet_queue->timestamp)
+   if (!packet_queue->cs_timestamp)
return OCSD_RESP_CONT;
 
-   packet_queue->timestamp = packet_queue->next_timestamp;
+   packet_queue->cs_timestamp = packet_queue->next_cs_timestamp;
 
/* Estimate the timestamp for the next range packet */
-   packet_queue->next_timestamp += packet_queue->instr_count;
+   packet_queue->next_cs_timestamp += packet_queue->instr_count;
packet_queue->instr_count = 0;
 
/* Tell the front end which traceid_queue needs attention */
@@ -308,8 +308,8 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 * Function do_soft_timestamp() will report the value to the front end,
 * hence asking the decoder to keep decoding rather than stopping.
 */
-   if (packet_queue->timestamp) {
-   packet_queue->next_timestamp = elem->timestamp;
+   if (packet_queue->cs_timestamp) {
+   packet_queue->next_cs_timestamp = elem->timestamp;
return OCSD_RESP_CONT;
}
 
@@ -320,8 +320,8 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 * which instructions started by subtracting the number of instructions
 * executed to the timestamp.
 */
-   packet_queue->timestamp = elem->timestamp - packet_queue->instr_count;
-   packet_queue->next_timestamp = elem->timestamp;
+   packet_queue->cs_timestamp = elem->timestamp - 
packet_queue->instr_count;
+   packet_queue->next_cs_timestamp = elem->timestamp;
packet_queue->instr_count = 0;
 
/* Tell the front end which traceid_queue needs attention */
@@ -334,8 +334,8 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 static void
 cs_etm_decoder__reset_timestamp(struct cs_etm_packet_queue *packet_queue)
 {
-   packet_queue->timestamp = 0;
-   packet_queue->next_timestamp = 0;
+   packet_queue->cs_timestamp = 0;
+   packet_queue->next_cs_timestamp = 0;
packet_queue->instr_count = 0;
 }
 
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 7e63e7dedc33..533f6f2f0685 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -38,8 +38,6 @@
 #include 
 #include "util/synthetic-events.h"
 
-#define MAX_TIMESTAMP (~0ULL)
-
 struct cs_etm_auxtrace {
struct auxtrace auxtrace;
struct auxtrace_queues queues;
@@ -86,7 +84,7 @@ struct cs_etm_queue {
struct cs_etm_decoder *decoder;
struct auxtrace_buffer *buffer;
unsigned int queue_nr;
-   u8 pending_timestamp;
+   u8 pending_timestamp_chan_id;
u64 offset;
const unsigned char *buf;
size_t buf_len, buf_used;
@@ -208,7 +206,7 @@ void cs_etm__etmq_set_traceid_queue_timestamp(struct 
cs_etm_queue *etmq,
 * be more than one channel per cs_etm_queue, we need to specify
 * what traceID queue needs servicing.
 */
-   etmq->pending_timestamp = trace_chan_id;
+   etmq->pending_timestamp_chan_id = trace_chan_id;
 }
 
 static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue *etmq,
@@ -216,22 +214,22 @@ static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue 
*etmq,
 {
struct cs_etm_packet_queue *packet_queue;
 
-   if (!etmq->pending_timestamp)
+   if (!etmq->pending_timestamp_chan_id)
return 0;
 
if (trace_chan_id)
-   *trace_chan_id = etmq->pending_timestamp;
+   *trace_chan_id = etmq->pending_timestamp_chan_id;
 
packet_queue = cs_etm__etmq_get_packet_queue(etmq,
-etmq->pending_timestamp);
+

[PATCH v2 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-16 Thread James Clark
The following attribute is set when synthesising samples in
timed decoding mode:

attr.sample_type |= PERF_SAMPLE_TIME;

This results in new samples that appear to have timestamps but
because we don't assign any timestamps to the samples, when the
resulting inject file is opened again, the synthesised samples
will be on the wrong side of the MMAP or COMM events.

For example this results in the samples being associated with
the perf binary, rather than the target of the record:

perf record -e cs_etm/@tmc_etr0/u top
perf inject -i perf.data -o perf.inject --itrace=i100il
perf report -i perf.inject

Where 'Command' == perf should show as 'top':

# Overhead  Command  Source Shared Object  Source Symbol   Target 
Symbol   Basic Block Cycles
#   ...    ..  
..  ..
#
31.08%  perf [unknown] [.] 0x0040c3f8  [.] 
0x0040c3e8  -

If the perf.data file is opened directly with perf, without the
inject step, then this already works correctly because the
events are synthesised after the COMM and MMAP events and
no second sorting happens. Re-sorting only happens when opening
the perf.inject file for the second time so timestamps are
needed.

Using the timestamp from the AUX record mirrors the current
behaviour when opening directly with perf, because the events
are generated on the call to cs_etm__process_queues().

Co-developed-by: Al Grant 
Signed-off-by: Al Grant 
Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 533f6f2f0685..e5c1a1b22a2a 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -54,6 +54,7 @@ struct cs_etm_auxtrace {
u8 sample_instructions;
 
int num_cpu;
+   u64 latest_kernel_timestamp;
u32 auxtrace_type;
u64 branches_sample_type;
u64 branches_id;
@@ -1192,6 +1193,8 @@ static int cs_etm__synth_instruction_sample(struct 
cs_etm_queue *etmq,
event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
event->sample.header.size = sizeof(struct perf_event_header);
 
+   if (!etm->timeless_decoding)
+   sample.time = etm->latest_kernel_timestamp;
sample.ip = addr;
sample.pid = tidq->pid;
sample.tid = tidq->tid;
@@ -1248,6 +1251,8 @@ static int cs_etm__synth_branch_sample(struct 
cs_etm_queue *etmq,
event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
event->sample.header.size = sizeof(struct perf_event_header);
 
+   if (!etm->timeless_decoding)
+   sample.time = etm->latest_kernel_timestamp;
sample.ip = ip;
sample.pid = tidq->pid;
sample.tid = tidq->tid;
@@ -2412,9 +2417,10 @@ static int cs_etm__process_event(struct perf_session 
*session,
else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
return cs_etm__process_switch_cpu_wide(etm, event);
 
-   if (!etm->timeless_decoding &&
-   event->header.type == PERF_RECORD_AUX)
+   if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) {
+   etm->latest_kernel_timestamp = sample_kernel_timestamp;
return cs_etm__process_queues(etm);
+   }
 
return 0;
 }
-- 
2.28.0



[PATCH v2 0/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-16 Thread James Clark
Changes since v1:
 * Improved variable name from etm_timestamp -> cs_timestamp
 * Fixed ordering of Signed-off-by

James Clark (2):
  perf cs-etm: Refactor timestamp variable names
  perf cs-etm: Set time on synthesised samples to preserve ordering

 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 18 +++
 tools/perf/util/cs-etm.c  | 52 ++-
 tools/perf/util/cs-etm.h  |  4 +-
 3 files changed, 39 insertions(+), 35 deletions(-)

-- 
2.28.0



Re: [PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-16 Thread James Clark



On 15/04/2021 22:54, Mathieu Poirier wrote:
> On Wed, Apr 14, 2021 at 05:39:19PM +0300, James Clark wrote:
>> The following attribute is set when synthesising samples in
>> timed decoding mode:
>>
>> attr.sample_type |= PERF_SAMPLE_TIME;
>>
>> This results in new samples that appear to have timestamps but
>> because we don't assign any timestamps to the samples, when the
>> resulting inject file is opened again, the synthesised samples
>> will be on the wrong side of the MMAP or COMM events.
>>
> 
> I understand the problem.  Once again an issue caused by CS and the kernel
> having a different view of time. 
> 
>> For example this results in the samples being associated with
>> the perf binary, rather than the target of the record:
>>
>> perf record -e cs_etm/@tmc_etr0/u top
>> perf inject -i perf.data -o perf.inject --itrace=i100il
>> perf report -i perf.inject
>>
>> Where 'Command' == perf should show as 'top':
>>
>> # Overhead  Command  Source Shared Object  Source Symbol   
>> Target Symbol   Basic Block Cycles
>> #   ...    ..  
>> ..  ..
>> #
>> 31.08%  perf [unknown] [.] 0x0040c3f8  [.] 
>> 0x0040c3e8  -
>>
>> If the perf.data file is opened directly with perf, without the
>> inject step, then this already works correctly because the
>> events are synthesised after the COMM and MMAP events and
>> no second sorting happens. Re-sorting only happens when opening
>> the perf.inject file for the second time so timestamps are
>> needed.
>>
>> Using the timestamp from the AUX record mirrors the current
>> behaviour when opening directly with perf, because the events
>> are generated on the call to cs_etm__process_queues().
>>
>> Signed-off-by: James Clark 
>> Co-developed-by: Al Grant 
>> Signed-off-by: Al Grant 
> 
> Suzuki is correct, your name has to appear after Al's.
> 
>> ---
>>  tools/perf/util/cs-etm.c | 10 --
>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index c25da2ffa8f3..d0fa9dce47f1 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -54,6 +54,7 @@ struct cs_etm_auxtrace {
>>  u8 sample_instructions;
>>  
>>  int num_cpu;
>> +u64 latest_kernel_timestamp;
>>  u32 auxtrace_type;
>>  u64 branches_sample_type;
>>  u64 branches_id;
>> @@ -1192,6 +1193,8 @@ static int cs_etm__synth_instruction_sample(struct 
>> cs_etm_queue *etmq,
>>  event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
>>  event->sample.header.size = sizeof(struct perf_event_header);
>>  
>> +if (!etm->timeless_decoding)
>> +sample.time = etm->latest_kernel_timestamp;
>>  sample.ip = addr;
>>  sample.pid = tidq->pid;
>>  sample.tid = tidq->tid;
>> @@ -1248,6 +1251,8 @@ static int cs_etm__synth_branch_sample(struct 
>> cs_etm_queue *etmq,
>>  event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
>>  event->sample.header.size = sizeof(struct perf_event_header);
>>  
>> +if (!etm->timeless_decoding)
>> +sample.time = etm->latest_kernel_timestamp;
>>  sample.ip = ip;
>>  sample.pid = tidq->pid;
>>  sample.tid = tidq->tid;
>> @@ -2412,9 +2417,10 @@ static int cs_etm__process_event(struct perf_session 
>> *session,
>>  else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
>>  return cs_etm__process_switch_cpu_wide(etm, event);
>>  
>> -if (!etm->timeless_decoding &&
>> -event->header.type == PERF_RECORD_AUX)
>> +if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) {
>> +etm->latest_kernel_timestamp = sample_kernel_timestamp;
> 
> It will be fun to fix this when 8.4 comes out but for now it's the best we've
> got.
> 

Thanks for the reviews Leo, Mathieu and Suzuki. Yes I think for 8.4 we can also 
do something
very similar to Leo's "perf arm-spe: Bail out if the trace is later than perf 
event"
patch where decoding is paused until the other events with later timestamps
have been received. At the moment the CS decoding happens all at once.

I will submit a new set with the fixes and better variable name.

James

> Reviewed-by: Mathieu Poirier 
> 
>>  return cs_etm__process_queues(etm);
>> +}
>>  
>>  return 0;
>>  }
>> -- 
>> 2.28.0
>>


Re: [PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-16 Thread James Clark



On 15/04/2021 17:33, Leo Yan wrote:
> Hi James,
> 
> On Thu, Apr 15, 2021 at 03:51:46PM +0300, James Clark wrote:
> 
> [...]
> 
>>> For the original perf data file with "--per-thread" option, the decoder
>>> runs into the condition for "etm->timeless_decoding"; and it doesn't
>>> contain ETM timestamp.
>>>
>>> Afterwards, the injected perf data file also misses ETM timestamp and
>>> hit the condition "etm->timeless_decoding".
>>>
>>> So I am confusing why the original perf data can be processed properly
>>> but fails to handle the injected perf data file.
>>
>> Hi Leo,
>>
>> My patch only deals with per-cpu mode. With per-thread mode everything is 
>> already working
>> because _none_ of the events have timestamps because they are not enabled by 
>> default:
>>
>>  /* In per-cpu case, always need the time of mmap events etc */
>>  if (!perf_cpu_map__empty(cpus))
>>  evsel__set_sample_bit(tracking_evsel, TIME);
>>
>> When none of the events have timestamps, I think perf doesn't use the 
>> ordering code in
>> ordered-events.c. So when the inject file is opened, the events are read in 
>> file order.
> 
> The explanation makes sense to me.  One thinking: if the original file
> doesn't use the ordered event, is it possible for the injected file to
> not use the ordered event as well?

Yes if you inject on a file with no timestamps and then open it, then the 
function queue_event()
in ordered_events.c is not hit.

If you create a file based on one with timestamps, then the queue_event() 
function is hit
even on the injected file.

The relevant bit of code is here:

if (tool->ordered_events) {
u64 timestamp = -1ULL;

ret = evlist__parse_sample_timestamp(evlist, event, &timestamp);
if (ret && ret != -1)
return ret;

ret = perf_session__queue_event(session, event, timestamp, 
file_offset);
if (ret != -ETIME)
return ret;
}

return perf_session__deliver_event(session, event, tool, file_offset);

If tool->ordered_events is set AND the timestamp for the sample parses to be 
non zero
and non -1:

if (!timestamp || timestamp == ~0ULL)
return -ETIME;

Then the event is added into the queue, otherwise it goes straight through to 
perf_session__deliver_event()
The ordering can be disabled manually with tool->ordered_events and 
--disable-order and is also disabled
with --dump-raw-trace.

It seems like processing the file only really works when all events are 
unordered but in the right order,
or ordered with the right timestamps set.

> 
> Could you confirm Intel-pt can work well for per-cpu mode for inject
> file?

Yes it seems like synthesised samples are assigned sensible timestamps.

perf record -e intel_pt//u top
perf inject -i perf.data -o perf-intel-per-cpu.inject.data 
--itrace=i100i --strip
perf report -i perf-intel-per-cpu.inject.data -D

Results in the correct binary and DSO names and the SAMPLE timestamp is after 
the COMM:

0 381165621595220 0x1200 [0x38]: PERF_RECORD_COMM exec: top:20173/20173

...

2 381165622169297 0x13b0 [0x38]: PERF_RECORD_SAMPLE(IP, 0x2): 
20173/20173: 0x7fdaa14abf53 period: 100 addr: 0
... thread: top:20173
.. dso: /lib/x86_64-linux-gnu/ld-2.27.so

Per-thread also works, but no samples or events have timestamps.

> 
>> So it's not really about --per-thread vs per-cpu mode, it's actually about 
>> whether
>> PERF_SAMPLE_TIME is set, which is set as a by-product of per-cpu mode.
>>
>> I hope I understood your question properly.
> 
> Thanks for info, sorry if I miss any info you have elaborated.
> 
> Leo
> 


Re: [PATCH v4 1/6] perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS

2021-04-15 Thread James Clark



On 15/04/2021 17:41, Leo Yan wrote:
> Hi James,
> 
> On Thu, Apr 15, 2021 at 05:13:36PM +0300, James Clark wrote:
>> On 12/04/2021 12:10, Leo Yan wrote:
>>> The enum value 'ARM_SPE_PER_CPU_MMAPS' is never used so remove it.
>>
>> Hi Leo,
>>
>> I think this causes an error when attempting to open a newly recorded file
>> with an old version of perf. The value ARM_SPE_AUXTRACE_PRIV_MAX is used 
>> here:
>>
>>  size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
>>  struct perf_record_time_conv *tc = &session->time_conv;
>>  struct arm_spe *spe;
>>  int err;
>>
>>  if (auxtrace_info->header.size < sizeof(struct 
>> perf_record_auxtrace_info) +
>>  min_sz)
>>  return -EINVAL;
>>
>> And removing ARM_SPE_PER_CPU_MMAPS changes the value of 
>> ARM_SPE_AUXTRACE_PRIV_MAX.
>>
>> At least I think that's what's causing the problem. I get this error:
>>
>>  ./perf report -i per-thread-spe-time.data
>>  0x1c0 [0x18]: failed to process type: 70 [Invalid argument]
>>  Error:
>>  failed to process sample
>>  # To display the perf.data header info, please use 
>> --header/--header-only options.
>>  #
> 
> Yes, when working on this patch I had concern as well.
> 
> I carefully thought that the perf tool should be backwards-compatible,
> but there have no requirement for forwards-compatibility.  This is the
> main reason why I kept this patch.
> 
> If you or anyone could confirm the forwards-compatibility is required,
> it's quite fine for me to drop this patch.
> 

Personally, I can easily imagine sending a file to someone to open with an 
older version and it causing
friction where it could be easily avoided. And it even made testing a bit more 
difficult because
I wanted to compare opening the same file with the patched and un-patched 
version. But if there
is no hard requirement I can't really put too much pressure to not remove it.

> Thanks a lot for the reviewing and testing!
> Leo
> 


Re: [PATCH v4 4/6] perf arm-spe: Assign kernel time to synthesized event

2021-04-15 Thread James Clark



On 12/04/2021 12:10, Leo Yan wrote:
> In current code, it assigns the arch timer counter to the synthesized
> samples Arm SPE trace, thus the samples don't contain the kernel time
> but only contain the raw counter value.
> 
> To fix the issue, this patch converts the timer counter to kernel time
> and assigns it to sample timestamp.
> 
> Signed-off-by: Leo Yan 
> ---
>  tools/perf/util/arm-spe.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> index 23714cf0380e..c13a89f06ab8 100644
> --- a/tools/perf/util/arm-spe.c
> +++ b/tools/perf/util/arm-spe.c
> @@ -234,7 +234,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
>   struct arm_spe_record *record = &speq->decoder->record;
>  
>   if (!spe->timeless_decoding)
> - sample->time = speq->timestamp;
> + sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);


I noticed that in arm_spe_recording_options() the TIME sample bit is set 
regardless of any options.
I don't know of a way to remove this, and if there isn't, does that mean that 
all the code in this
file that looks at spe->timeless_decoding is untested and has never been hit?

Unless there is a way to get a perf file with only the AUXTRACE event and no 
others? I think that one
might have no timestamp set. Otherwise other events will always have timestamps 
so spe->timeless_decoding
is always false.



>  
>   sample->ip = record->from_ip;
>   sample->cpumode = arm_spe_cpumode(spe, sample->ip);
> 


Re: [PATCH v4 0/6] perf arm-spe: Enable timestamp

2021-04-15 Thread James Clark
Hi Leo,

I was looking at testing this on N1SDP and I thought I would try the round trip 
with perf inject and
then perf report but saw that perf inject with SPE always results in an error 
(unrelated to your change)

 -> ./perf report -i per-thread-spe-time.inject.data
0x1328 [0x8]: failed to process type: 9 [Bad address]
Error:
failed to process sample


Do you have any test suggestions other than looking at the raw data?

Thanks
James

On 12/04/2021 12:10, Leo Yan wrote:
> This patch set is to enable timestamp for Arm SPE trace.  It reads out
> TSC parameters from the TIME_CONV event, the parameters are used for
> conversion between timer counter and kernel time and which is applied
> for Arm SPE samples.
> 
> This version dropped the change for adding hardware clock parameters
> into auxtrace info, alternatively, it utilizes the TIME_CONV event to
> extract the clock parameters which is used for timestamp calculation.
> 
> This patch set can be clearly applied on perf/core branch with:
> 
>   commit 2c0cb9f56020 ("perf test: Add a shell test for 'perf stat 
> --bpf-counters' new option")
> 
> This patch series has been tested on Hisilicon D06 platform.
> 
> Changes from v3:
> * Let to be backwards-compatible for TIME_CONV event (Adrian).
> 
> Changes from v2:
> * Changed to use TIME_CONV event for extracting clock parameters (Al).
> 
> Changes from v1:
> * Rebased patch series on the latest perf/core branch;
> * Fixed the patch for dumping TSC parameters to support both the
>   older and new auxtrace info format.
> 
> 
> Leo Yan (6):
>   perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS
>   perf arm-spe: Save clock parameters from TIME_CONV event
>   perf arm-spe: Convert event kernel time to counter value
>   perf arm-spe: Assign kernel time to synthesized event
>   perf arm-spe: Bail out if the trace is later than perf event
>   perf arm-spe: Don't wait for PERF_RECORD_EXIT event
> 
>  tools/perf/util/arm-spe.c | 74 +--
>  tools/perf/util/arm-spe.h |  1 -
>  2 files changed, 64 insertions(+), 11 deletions(-)
> 


Re: [PATCH v4 1/6] perf arm-spe: Remove unused enum value ARM_SPE_PER_CPU_MMAPS

2021-04-15 Thread James Clark



On 12/04/2021 12:10, Leo Yan wrote:
> The enum value 'ARM_SPE_PER_CPU_MMAPS' is never used so remove it.

Hi Leo,

I think this causes an error when attempting to open a newly recorded file
with an old version of perf. The value ARM_SPE_AUXTRACE_PRIV_MAX is used here:

size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
struct perf_record_time_conv *tc = &session->time_conv;
struct arm_spe *spe;
int err;

if (auxtrace_info->header.size < sizeof(struct 
perf_record_auxtrace_info) +
min_sz)
return -EINVAL;

And removing ARM_SPE_PER_CPU_MMAPS changes the value of 
ARM_SPE_AUXTRACE_PRIV_MAX.

At least I think that's what's causing the problem. I get this error:

./perf report -i per-thread-spe-time.data
0x1c0 [0x18]: failed to process type: 70 [Invalid argument]
Error:
failed to process sample
# To display the perf.data header info, please use 
--header/--header-only options.
#

James

> 
> Signed-off-by: Leo Yan 
> ---
>  tools/perf/util/arm-spe.h | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
> index 98d3235781c3..105ce0ea0a01 100644
> --- a/tools/perf/util/arm-spe.h
> +++ b/tools/perf/util/arm-spe.h
> @@ -11,7 +11,6 @@
>  
>  enum {
>   ARM_SPE_PMU_TYPE,
> - ARM_SPE_PER_CPU_MMAPS,
>   ARM_SPE_AUXTRACE_PRIV_MAX,
>  };
>  
> 


Re: [PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-15 Thread James Clark



On 15/04/2021 15:39, Leo Yan wrote:
> On Wed, Apr 14, 2021 at 05:41:46PM +0300, James Clark wrote:
>> Hi,
>>
>> For this change, I also tried removing the setting of PERF_SAMPLE_TIME in 
>> cs_etm__synth_events(). In theory, this would remove the sorting when 
>> opening the file, but the change doesn't affect when the built-in events are 
>> saved to the inject file. Resulting in events like MMAP and COMM with 
>> timestamps, but the synthesised events without. This results in the same 
>> issue of the synthesised events appearing before the COMM and MMAP events. 
>> If it was possible to somehow tell perf to remove timestamps from built-in 
>> events, removing PERF_SAMPLE_TIME would probably be the right solution, 
>> because we don't set sample.time.
>>
>> For Arm v8.4 we will have the kernel time in the etm timestamps, so an if 
>> can be added to switch between this behaviour and the next (more correct) 
>> one depending on the hardware. 
>>
>> On the subject of timestamps, but not related to this change, some 
>> combinations of timestamp options aren't working. For example:
>>
>> perf record -e cs_etm/time,@tmc_etr0/u --per-thread
>> or  perf record -e cs_etm/@tmc_etr0/u --timestamp --per-thread
>>
>> These don't work because of the assumption that etm->timeless_decoding == 
>> --per-thread
>> and kernel timestamps enabled (/time/ or --timestamp) == etm timestamps 
>> enabled (/timestamp/), which isn't necessarily true.
>>
>> This can be made to work with a few code changes for cs_etm/time,timestamp/u 
>> --per-thread, but cs_etm/time/u --per-thread could be a bit more work. 
>> Changes involved would be using "per_cpu_mmaps" in some places instead of 
>> etm->timeless_decoding, and also setting etm->timeless_decoding based on 
>> whether there are any etm timestamps, not kernel ones. Although to search 
>> for any etm timestamp would involve a full decode ahead of time which might 
>> not be feasible (or maybe just checking the options, although that's not how 
>> it's done in cs_etm__is_timeless_decoding() currently).
> 
> Confirm for one thing:
> 
> For the original perf data file with "--per-thread" option, the decoder
> runs into the condition for "etm->timeless_decoding"; and it doesn't
> contain ETM timestamp.
> 
> Afterwards, the injected perf data file also misses ETM timestamp and
> hit the condition "etm->timeless_decoding".
> 
> So I am confusing why the original perf data can be processed properly
> but fails to handle the injected perf data file.

Hi Leo,

My patch only deals with per-cpu mode. With per-thread mode everything is 
already working
because _none_ of the events have timestamps because they are not enabled by 
default:

/* In per-cpu case, always need the time of mmap events etc */
if (!perf_cpu_map__empty(cpus))
evsel__set_sample_bit(tracking_evsel, TIME);

When none of the events have timestamps, I think perf doesn't use the ordering 
code in
ordered-events.c. So when the inject file is opened, the events are read in 
file order.
In file order, MMAP and COMM events come first, because they were encountered 
before the
AUX record where we generated synthetic events.

So it's not really about --per-thread vs per-cpu mode, it's actually about 
whether
PERF_SAMPLE_TIME is set, which is set as a by-product of per-cpu mode.

I hope I understood your question properly.

James


> 
> Thanks,
> Leo
> 
>> Or, we could force /time/ and /timestamp/ options to always be enabled 
>> together in the record stage. 
>>
>>
>> Thanks
>> James
>>
>> On 14/04/2021 17:39, James Clark wrote:
>>> The following attribute is set when synthesising samples in
>>> timed decoding mode:
>>>
>>> attr.sample_type |= PERF_SAMPLE_TIME;
>>>
>>> This results in new samples that appear to have timestamps but
>>> because we don't assign any timestamps to the samples, when the
>>> resulting inject file is opened again, the synthesised samples
>>> will be on the wrong side of the MMAP or COMM events.
>>>
>>> For example this results in the samples being associated with
>>> the perf binary, rather than the target of the record:
>>>
>>> perf record -e cs_etm/@tmc_etr0/u top
>>> perf inject -i perf.data -o perf.inject --itrace=i100il
>>> perf report -i perf.inject
>>>
>>> Where 'Command' == perf should show as 'top':
>>>
>>> # Overhead  Command

Re: [PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-14 Thread James Clark
Hi,

For this change, I also tried removing the setting of PERF_SAMPLE_TIME in 
cs_etm__synth_events(). In theory, this would remove the sorting when opening 
the file, but the change doesn't affect when the built-in events are saved to 
the inject file. Resulting in events like MMAP and COMM with timestamps, but 
the synthesised events without. This results in the same issue of the 
synthesised events appearing before the COMM and MMAP events. If it was 
possible to somehow tell perf to remove timestamps from built-in events, 
removing PERF_SAMPLE_TIME would probably be the right solution, because we 
don't set sample.time.

For Arm v8.4 we will have the kernel time in the etm timestamps, so an if can 
be added to switch between this behaviour and the next (more correct) one 
depending on the hardware. 

On the subject of timestamps, but not related to this change, some combinations 
of timestamp options aren't working. For example:

perf record -e cs_etm/time,@tmc_etr0/u --per-thread
or  perf record -e cs_etm/@tmc_etr0/u --timestamp --per-thread

These don't work because of the assumption that etm->timeless_decoding == 
--per-thread
and kernel timestamps enabled (/time/ or --timestamp) == etm timestamps enabled 
(/timestamp/), which isn't necessarily true.

This can be made to work with a few code changes for cs_etm/time,timestamp/u 
--per-thread, but cs_etm/time/u --per-thread could be a bit more work. Changes 
involved would be using "per_cpu_mmaps" in some places instead of 
etm->timeless_decoding, and also setting etm->timeless_decoding based on 
whether there are any etm timestamps, not kernel ones. Although to search for 
any etm timestamp would involve a full decode ahead of time which might not be 
feasible (or maybe just checking the options, although that's not how it's done 
in cs_etm__is_timeless_decoding() currently).

Or, we could force /time/ and /timestamp/ options to always be enabled together 
in the record stage. 


Thanks
James

On 14/04/2021 17:39, James Clark wrote:
> The following attribute is set when synthesising samples in
> timed decoding mode:
> 
> attr.sample_type |= PERF_SAMPLE_TIME;
> 
> This results in new samples that appear to have timestamps but
> because we don't assign any timestamps to the samples, when the
> resulting inject file is opened again, the synthesised samples
> will be on the wrong side of the MMAP or COMM events.
> 
> For example this results in the samples being associated with
> the perf binary, rather than the target of the record:
> 
> perf record -e cs_etm/@tmc_etr0/u top
> perf inject -i perf.data -o perf.inject --itrace=i100il
> perf report -i perf.inject
> 
> Where 'Command' == perf should show as 'top':
> 
> # Overhead  Command  Source Shared Object  Source Symbol   Target 
> Symbol   Basic Block Cycles
> #   ...    ..  
> ..  ..
> #
> 31.08%  perf [unknown] [.] 0x0040c3f8  [.] 
> 0x0040c3e8  -
> 
> If the perf.data file is opened directly with perf, without the
> inject step, then this already works correctly because the
> events are synthesised after the COMM and MMAP events and
> no second sorting happens. Re-sorting only happens when opening
> the perf.inject file for the second time so timestamps are
> needed.
> 
> Using the timestamp from the AUX record mirrors the current
> behaviour when opening directly with perf, because the events
> are generated on the call to cs_etm__process_queues().
> 
> Signed-off-by: James Clark 
> Co-developed-by: Al Grant 
> Signed-off-by: Al Grant 
> ---
>  tools/perf/util/cs-etm.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index c25da2ffa8f3..d0fa9dce47f1 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -54,6 +54,7 @@ struct cs_etm_auxtrace {
>   u8 sample_instructions;
>  
>   int num_cpu;
> + u64 latest_kernel_timestamp;
>   u32 auxtrace_type;
>   u64 branches_sample_type;
>   u64 branches_id;
> @@ -1192,6 +1193,8 @@ static int cs_etm__synth_instruction_sample(struct 
> cs_etm_queue *etmq,
>   event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
>   event->sample.header.size = sizeof(struct perf_event_header);
>  
> + if (!etm->timeless_decoding)
> + sample.time = etm->latest_kernel_timestamp;
>   sample.ip = addr;
>   sample.pid = tidq->pid;
>   sample.tid = tidq->tid;
> @@ -1248,6 +1251,8 @@ static int cs_etm__synth_branch_sample(struct 
> cs_et

[PATCH 2/2] perf cs-etm: Set time on synthesised samples to preserve ordering

2021-04-14 Thread James Clark
The following attribute is set when synthesising samples in
timed decoding mode:

attr.sample_type |= PERF_SAMPLE_TIME;

This results in new samples that appear to have timestamps but
because we don't assign any timestamps to the samples, when the
resulting inject file is opened again, the synthesised samples
will be on the wrong side of the MMAP or COMM events.

For example this results in the samples being associated with
the perf binary, rather than the target of the record:

perf record -e cs_etm/@tmc_etr0/u top
perf inject -i perf.data -o perf.inject --itrace=i100il
perf report -i perf.inject

Where 'Command' == perf should show as 'top':

# Overhead  Command  Source Shared Object  Source Symbol   Target 
Symbol   Basic Block Cycles
#   ...    ..  
..  ..
#
31.08%  perf [unknown] [.] 0x0040c3f8  [.] 
0x0040c3e8  -

If the perf.data file is opened directly with perf, without the
inject step, then this already works correctly because the
events are synthesised after the COMM and MMAP events and
no second sorting happens. Re-sorting only happens when opening
the perf.inject file for the second time so timestamps are
needed.

Using the timestamp from the AUX record mirrors the current
behaviour when opening directly with perf, because the events
are generated on the call to cs_etm__process_queues().

Signed-off-by: James Clark 
Co-developed-by: Al Grant 
Signed-off-by: Al Grant 
---
 tools/perf/util/cs-etm.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index c25da2ffa8f3..d0fa9dce47f1 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -54,6 +54,7 @@ struct cs_etm_auxtrace {
u8 sample_instructions;
 
int num_cpu;
+   u64 latest_kernel_timestamp;
u32 auxtrace_type;
u64 branches_sample_type;
u64 branches_id;
@@ -1192,6 +1193,8 @@ static int cs_etm__synth_instruction_sample(struct 
cs_etm_queue *etmq,
event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
event->sample.header.size = sizeof(struct perf_event_header);
 
+   if (!etm->timeless_decoding)
+   sample.time = etm->latest_kernel_timestamp;
sample.ip = addr;
sample.pid = tidq->pid;
sample.tid = tidq->tid;
@@ -1248,6 +1251,8 @@ static int cs_etm__synth_branch_sample(struct 
cs_etm_queue *etmq,
event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
event->sample.header.size = sizeof(struct perf_event_header);
 
+   if (!etm->timeless_decoding)
+   sample.time = etm->latest_kernel_timestamp;
sample.ip = ip;
sample.pid = tidq->pid;
sample.tid = tidq->tid;
@@ -2412,9 +2417,10 @@ static int cs_etm__process_event(struct perf_session 
*session,
else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
return cs_etm__process_switch_cpu_wide(etm, event);
 
-   if (!etm->timeless_decoding &&
-   event->header.type == PERF_RECORD_AUX)
+   if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) {
+   etm->latest_kernel_timestamp = sample_kernel_timestamp;
return cs_etm__process_queues(etm);
+   }
 
return 0;
 }
-- 
2.28.0



[PATCH 1/2] perf cs-etm: Refactor timestamp variable names

2021-04-14 Thread James Clark
Remove ambiguity in variable names relating to timestamps.
A later commit will save the sample kernel timestamp in one
of the etm structs, so name all elements appropriately to
avoid confusion.

This also removes some ambiguity arising from the fact
that the --timestamp argument to perf record refers to
sample kernel timestamps, and the /timestamp/ event modifier
refers to etm timestamps, so the term is overloaded.

Signed-off-by: James Clark 
---
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 18 
 tools/perf/util/cs-etm.c  | 42 +--
 tools/perf/util/cs-etm.h  |  4 +-
 3 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 059bcec3f651..055cb93eca59 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -276,13 +276,13 @@ cs_etm_decoder__do_soft_timestamp(struct cs_etm_queue 
*etmq,
  const uint8_t trace_chan_id)
 {
/* No timestamp packet has been received, nothing to do */
-   if (!packet_queue->timestamp)
+   if (!packet_queue->etm_timestamp)
return OCSD_RESP_CONT;
 
-   packet_queue->timestamp = packet_queue->next_timestamp;
+   packet_queue->etm_timestamp = packet_queue->next_etm_timestamp;
 
/* Estimate the timestamp for the next range packet */
-   packet_queue->next_timestamp += packet_queue->instr_count;
+   packet_queue->next_etm_timestamp += packet_queue->instr_count;
packet_queue->instr_count = 0;
 
/* Tell the front end which traceid_queue needs attention */
@@ -308,8 +308,8 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 * Function do_soft_timestamp() will report the value to the front end,
 * hence asking the decoder to keep decoding rather than stopping.
 */
-   if (packet_queue->timestamp) {
-   packet_queue->next_timestamp = elem->timestamp;
+   if (packet_queue->etm_timestamp) {
+   packet_queue->next_etm_timestamp = elem->timestamp;
return OCSD_RESP_CONT;
}
 
@@ -320,8 +320,8 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 * which instructions started by subtracting the number of instructions
 * executed to the timestamp.
 */
-   packet_queue->timestamp = elem->timestamp - packet_queue->instr_count;
-   packet_queue->next_timestamp = elem->timestamp;
+   packet_queue->etm_timestamp = elem->timestamp - 
packet_queue->instr_count;
+   packet_queue->next_etm_timestamp = elem->timestamp;
packet_queue->instr_count = 0;
 
/* Tell the front end which traceid_queue needs attention */
@@ -334,8 +334,8 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 static void
 cs_etm_decoder__reset_timestamp(struct cs_etm_packet_queue *packet_queue)
 {
-   packet_queue->timestamp = 0;
-   packet_queue->next_timestamp = 0;
+   packet_queue->etm_timestamp = 0;
+   packet_queue->next_etm_timestamp = 0;
packet_queue->instr_count = 0;
 }
 
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 7e63e7dedc33..c25da2ffa8f3 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -38,8 +38,6 @@
 #include 
 #include "util/synthetic-events.h"
 
-#define MAX_TIMESTAMP (~0ULL)
-
 struct cs_etm_auxtrace {
struct auxtrace auxtrace;
struct auxtrace_queues queues;
@@ -86,7 +84,7 @@ struct cs_etm_queue {
struct cs_etm_decoder *decoder;
struct auxtrace_buffer *buffer;
unsigned int queue_nr;
-   u8 pending_timestamp;
+   u8 pending_timestamp_chan_id;
u64 offset;
const unsigned char *buf;
size_t buf_len, buf_used;
@@ -208,7 +206,7 @@ void cs_etm__etmq_set_traceid_queue_timestamp(struct 
cs_etm_queue *etmq,
 * be more than one channel per cs_etm_queue, we need to specify
 * what traceID queue needs servicing.
 */
-   etmq->pending_timestamp = trace_chan_id;
+   etmq->pending_timestamp_chan_id = trace_chan_id;
 }
 
 static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue *etmq,
@@ -216,22 +214,22 @@ static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue 
*etmq,
 {
struct cs_etm_packet_queue *packet_queue;
 
-   if (!etmq->pending_timestamp)
+   if (!etmq->pending_timestamp_chan_id)
return 0;
 
if (trace_chan_id)
-   *trace_chan_id = etmq->pending_timestamp;
+   *trace_chan_id = etmq->pending_timestamp_chan_id;
 
packet_queue = cs_etm__etmq_get_packet_queue(etmq,
-etmq->pending_timestamp);
+  

Re: [PATCH RESEND WITH CCs v3 4/4] perf tools: determine if LR is the return address

2021-03-26 Thread James Clark



On 22/03/2021 13:57, Alexandre Truong wrote:
> Hi Arnaldo,
> 
> Thanks for your reply.
> 
> I profiled a few applications and here are the results.
> 
> ---
>     |   |    |
>     |
>     App |   Before the patch    | With the patch (% of increase) | if 
> only LR is recorded (% of increase) |
>     |   |    |
>     |
> ---
>     |   |    |
>     |
>   firefox   | nb of samples: 834k   |   nb of samples: 685k (+70%)   |   
> nb of samples: 671k (+10%)   |
>     | size:  101 MB |   size:  141 MB    |   
> size:  90 MB |

Hi Alex,

I think the 70% increase when recording all of the registers is too much, so we 
should continue with the change
to only record the link register. In my opinion a 5% - 10% increase by enabling 
LR recording by default wouldn't
be noticeable, so it's worth doing. But obviously we'll need other opinions too.

Do you also have the average stack lengths for these figures?

James

> ---
>     |   |    |
>     |
>     htop    | nb of samples: 500k   |   nb of samples: 443k (+71%)   |   
> nb of samples: 504k (+7%)    |
>     | size:  69 MB  |   size:  105 MB    |   
> size:  75 MB |
> ---
>     |   |    |
>     |
>     top | nb of samples: 500k   |   nb of samples: 521k (+70%)   |   
> nb of samples: 481k (+7%)    |
>     | size:  69 MB  |   size:  122 MB    |   
> size:  71 MB |
> ---
>     |   |    |
>     |
> thunderbird | nb of samples: 266k   |   nb of samples: 271k (+43%)   |   
> nb of samples: 269k (+5%)    |
>     | size:  31 MB  |   size:  45 MB |   
> size:  33 MB |
> ---
> 
> What do you think of these results ?
> Should there be a selectable mode specified in .perfconfig ?
> I will investigate in reducing the size of the file as well. As, we only need 
> the link register in theory.
> 
> Also for the compilation on different platforms
> What do you think of this fix ?
> 
> diff --git a/tools/arch/arm64/include/uapi/asm/perf_regs.h 
> b/tools/arch/arm64/include/uapi/asm/perf_regs.h
> index d54daafa89e3..15b6805202c1 100644
> --- a/tools/arch/arm64/include/uapi/asm/perf_regs.h
> +++ b/tools/arch/arm64/include/uapi/asm/perf_regs.h
> @@ -2,7 +2,7 @@
>  #ifndef _ASM_ARM64_PERF_REGS_H
>  #define _ASM_ARM64_PERF_REGS_H
>  
> -enum perf_event_arm_regs {
> +enum perf_event_arm64_regs {
>     PERF_REG_ARM64_X0,
>     PERF_REG_ARM64_X1,
>     PERF_REG_ARM64_X2,
> diff --git a/tools/perf/util/arm-frame-pointer-unwind-support.c 
> b/tools/perf/util/arm-frame-pointer-unwind-support.c
> index 964efd08e72e..0104477a762a 100644
> --- a/tools/perf/util/arm-frame-pointer-unwind-support.c
> +++ b/tools/perf/util/arm-frame-pointer-unwind-support.c
> @@ -1,6 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include "../arch/arm64/include/uapi/asm/perf_regs.h"
> -#include "arch/arm64/include/perf_regs.h"
>  #include "event.h"
>  #include "arm-frame-pointer-unwind-support.h"
>  #include "callchain.h"
> @@ -13,8 +12,9 @@ struct entries {
>  
>  static bool get_leaf_frame_caller_enabled(struct perf_sample *sample)
>  {
> +   unsigned long long arm64_regs_mask = ((1ULL << PERF_REG_ARM64_MAX) - 
> 1);
>     return callchain_param.record_mode == CALLCHAIN_FP && 
> sample->user_regs.regs
> -       && sampl

Re: [PATCH RESEND WITH CCs v3 4/4] perf tools: determine if LR is the return address

2021-03-05 Thread James Clark
I've tested this patchset on a few different applications and have seen it 
significantly improve
quality of frame pointer stacks on aarch64. For example with GDB 10 and default 
build options,
'bfd_calc_gnu_debuglink_crc32' is a leaf function, and its caller 'gdb_bfd_crc' 
is ommitted,
but with the patchset it is included. I've also confirmed that this is correct 
from looking at
the source code.

Before:

# Children  Self  Command  Shared Object   
Symbol  





#     ...  ..  
...
#
34.55% 0.00%  gdb-100  gdb-100 [.] 
_start
   0.78%
_start
__libc_start_main
main
gdb_main
captured_command_loop
gdb_do_one_event
check_async_event_handlers
fetch_inferior_event
inferior_event_handler
do_all_inferior_continuations
attach_post_wait
post_create_inferior
svr4_solib_create_inferior_hook
solib_add
solib_read_symbols
symbol_file_add_with_addrs
read_symbols
elf_symfile_read
find_separate_debug_file_by_debuglink[abi:cxx11]
find_separate_debug_file
separate_debug_file_exists
gdb_bfd_crc
bfd_calc_gnu_debuglink_crc32

After:

# Children  Self  Command  Shared Object   
Symbol  





#     ...  ..  
...
#
34.55% 0.00%  gdb-100  gdb-100 [.] 
_start
   0.78%
_start
__libc_start_main
main
gdb_main
captured_command_loop
gdb_do_one_event
check_async_event_handlers
fetch_inferior_event
inferior_event_handler
do_all_inferior_continuations
attach_post_wait
post_create_inferior
svr4_solib_create_inferior_hook
solib_add
solib_read_symbols
symbol_file_add_with_addrs
read_symbols
elf_symfile_read
find_separate_debug_file_by_debuglink[abi:cxx11]
find_separate_debug_file
separate_debug_file_exists
get_file_crc   <- leaf frame caller 
added
bfd_calc_gnu_debuglink_crc32

There is a question about whether the overhead of recording all the registers 
is acceptable, for
filesize and time. We could make it a manual step, at the cost of not showing 
better frame pointer
stacks by default.

Tested-by: James Clark 

On 04/03/2021 18:32, Alexandre Truong wrote:
> On arm64 and frame pointer mode (e.g: perf record --callgraph fp),
> use dwarf unwind info to check if the link register is the return
> address in order to inject it to the frame pointer stack.
> 
> Write the following application:
> 
>   int a = 10;
> 
>   void f2(void)
>   {
>   for (int i = 0; i < 100; i++)
>   a *= a;
>   }
> 
>   void f1()
>   {
>   for (int i = 0; i < 10; i++)
>   f2();
>   }
> 
>   int main (void)
>   {
>   f1();
>   return 0;
&g

Re: [PATCH 3/7] perf cs-etm: Save aux records in each etm queue

2021-03-01 Thread James Clark



On 27/02/2021 09:10, Leo Yan wrote:
> On Fri, Feb 12, 2021 at 04:45:09PM +0200, James Clark wrote:
>> The aux records will be used set the bounds of decoding in a
>> later commit. In the future we may also want to use the flags
>> of each record to control decoding.
>>
>> Do these need to be saved in their entirety, or can pointers
>> to each record safely be saved instead for later access?
> 
> Rather than introudcing the perf record list, I just wander if we can
> use easier method to fix this problem.  So below is the rough idea
> (though I don't really verify it):
> 
> The essential information we need is what's the valid buffer length can
> be used for decoding.  Though cs_etm_queue::buf_len tracks the buffer
> length, but it's the buffer length is for the whole AUX buffer, and
> which belongs to multiple "PERF_RECORD_AUX" events.  So we cannot decode
> at once for the whole trace data in the AUX trace buffer, on the other
> hand, the incoming "PERF_RECORD_AUX" event can guide the CoreSight
> decoder it should decode how much buffer size.  At the end, the trace
> data can be decoded step by step based on the incoming "PERF_RECORD_AUX"
> events.
> 
> I'd like to propose to add a new field "cs_etm_queue::buf_rec_len", it
> stands for the record length based on the RECORD_AUX event.  In
> theory, this value should be always less than "cs_etm_queue::buf_len".
> 
> When every time the "PERF_RECORD_AUX" event is coming, we find out the
> corresponding queue (so this can be applied for "1:1" or "N:1" models
> for source and sink), and accumulate "perf_record_aux::aux_size" into
> "cs_etm_queue::buf_rec_len".
> 
> At the decoder side, it decreases "etmq->buf_rec_len" until to zero for
> the current round of decoding (see cs_etm__decode_data_block()).  Since
> all the "PERF_RECORD_AUX" event will be processed before
> "PERF_RECORD_EXIT" event, so we don't worry the tail trace data will be
> ignored.
> 
> The main reason for this suggestion is it don't need to change the
> significant logic in current code.  I will try to do experiment for this
> idea and share back.
> 
> James, if you think I miss anything, please correct me as needed.
> Thanks!
> 

This is an interesting idea, I think we could push decoded packets into the
min heap as the aux records are received, and not do anything with them until
the end of the data is reached. That way instead of saving aux records, we'd
save the result of the decode for each aux record.

Currently each cs_etm_queue has a cs_etm_traceid_queue/cs_etm_packet_queue for 
each
stream, but that would have to be changed to have multiple ones because multiple
packets could be decoded to get through the whole aux record.

It would be a similarly sized change, and could also have a bigger impact on
memory. So I'm not sure if it would help to reduce the changes, but it is 
possible.

James

> Leo
> 
>> Signed-off-by: James Clark 
>> ---
>>  tools/perf/util/cs-etm.c | 32 +---
>>  1 file changed, 29 insertions(+), 3 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index 8f8b448632fb..88b541b2a804 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -92,12 +92,16 @@ struct cs_etm_queue {
>>  /* Conversion between traceID and index in traceid_queues array */
>>  struct intlist *traceid_queues_list;
>>  struct cs_etm_traceid_queue **traceid_queues;
>> +int aux_record_list_len;
>> +int aux_record_list_idx;
>> +struct perf_record_aux *aux_record_list;
>>  };
>>  
>>  /* RB tree for quick conversion between traceID and metadata pointers */
>>  static struct intlist *traceid_list;
>>  
>> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
>> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu,
>> + struct perf_record_aux *aux_record);
>>  static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
>>  static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
>> pid_t tid);
>> @@ -585,6 +589,7 @@ static void cs_etm__free_queue(void *priv)
>>  
>>  cs_etm_decoder__free(etmq->decoder);
>>  cs_etm__free_traceid_queues(etmq);
>> +free(etmq->aux_record_list);
>>  free(etmq);
>>  }
>>  
>> @@ -759,6 +764,19 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
>> cs_etm_auxtrace *etm)
&

Re: [PATCH 2/7] perf cs-etm: Only search timestamp in current sample's queue.

2021-03-01 Thread James Clark



On 20/02/2021 13:50, Leo Yan wrote:
> On Fri, Feb 12, 2021 at 04:45:08PM +0200, James Clark wrote:
>> Change initial timestamp search to only operate on the queue
>> related to the current event. In a later change the bounds
>> of the aux record will also be used to reset the decoder and
>> the record is only relevant to a single queue.
> 
> I roughly understand this patch tries to establish the mechanism for
> timstamp search per CPU, but I am struggling to understand what's issue
> you try to address.
> 
> So could you give more description for "what's current issue?" and
> "why need to use this way to fix the issue?".  This would be very
> appreciated.

Hi Leo,

The issue is that the aux records used to reset the decoder are associated
with a specific CPU/aux queue. Currently when any new data is received, all
queues are searched for a timestamp. We can't do it that way any more because
the aux records aren't available yet.

The reason to fix it this way is because now we can only do decode when
an aux record is received. This will happen multiple times, and will also
be cpu/queue specific.
 
> 
>> This change makes some files that had coresight data
>> but didn't syntesise any events start working and generating
>> events. I'm not sure of the reason for that. I'd expect this
>> change to only affect the ordering of events.
> 
> This seems to me that this patch introduces regression.

I'm wondering if it is a regression, or accidentally fixing a bug.
It doesn't seem like it's possible to go from not generating any samples to
generating lots without accidentally fixing an existing issue. If there is
valid data there, what would be stopping it from generating any samples?

I do need to look into this more closely though to find the real reason for
it, which will probably shed more light on it.

> 
>> Signed-off-by: James Clark 
>> ---
>>  tools/perf/util/cs-etm.c | 30 ++
>>  1 file changed, 14 insertions(+), 16 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index 27894facae5e..8f8b448632fb 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -97,7 +97,7 @@ struct cs_etm_queue {
>>  /* RB tree for quick conversion between traceID and metadata pointers */
>>  static struct intlist *traceid_list;
>>  
>> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm);
>> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
>>  static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
>>  static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
>> pid_t tid);
>> @@ -524,7 +524,6 @@ static void cs_etm__dump_event(struct cs_etm_auxtrace 
>> *etm,
>>  static int cs_etm__flush_events(struct perf_session *session,
>>  struct perf_tool *tool)
>>  {
>> -int ret;
>>  struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
>> struct cs_etm_auxtrace,
>> auxtrace);
>> @@ -534,11 +533,6 @@ static int cs_etm__flush_events(struct perf_session 
>> *session,
>>  if (!tool->ordered_events)
>>  return -EINVAL;
>>  
>> -ret = cs_etm__update_queues(etm);
>> -
>> -if (ret < 0)
>> -return ret;
>> -
> 
> When flush events, it means the trace data is discontinuous or at the
> end of trace data.  If the trace data is discontinuous, we need to use
> cs_etm__update_queues() to create new queues.  So if we remove the
> calling cs_etm__update_queues(), I suspect it cannot handle the
> discontinuous trace data anymore.

Do you know how to force perf to record data like this? From my experience
etm->queues.new_data is only set once when the file is first opened.

> 
>>  if (etm->timeless_decoding)
>>  return cs_etm__process_timeless_queues(etm, -1);
>>  
>> @@ -851,10 +845,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace 
>> *etm,
>>  etmq->queue_nr = queue_nr;
>>  etmq->offset = 0;
>>  
>> -if (etm->timeless_decoding)
>> -return 0;
>> -else
>> -return cs_etm__search_first_timestamp(etmq);
>> +return 0;
>>  }
>>  
>>  static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
>> @@ -874,14 +865,20 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace 
>> *etm)
>>  return 0;
>>  

Re: [PATCH 0/7] Split Coresight decode by aux records

2021-03-01 Thread James Clark


On 24/02/2021 18:13, Mathieu Poirier wrote:
> Good day James,
> 
> I have received your patchset and added it to my queue.  On the flip side it
> will be 3 to 4 weeks (from today) before I get a chance to look at it.  As 
> such
> I suggest you don't wait on me before addressing the issues found by Leo.
> 

Ok, thanks Mathieu. I found that it's only working in --per-thread mode by
coincidence of my input file. So I would suggest to not look too thoroughly
until I have submitted v2. It should also probably still be an RFC rather than 
PATCH.

Thanks
James

> Thanks,
> Mathieu
> 
> On Fri, Feb 12, 2021 at 04:45:06PM +0200, James Clark wrote:
>> Hi All,
>>
>> Since my previous RFC, I've fixed --per-thread mode and solved
>> most of the open questions. I've also changed --dump-raw-trace
>> to use the same code path so it's also working now.
>>
>> I think the only open questions are:
>>   * General approach
>>   * If aux records need to be saved, or if they can be pulled
>> from elsewhere.
>>
>> I've also tested perf inject which is now working with troublesome
>> files.
>>
>> Thanks
>> James
>>
>> James Clark (7):
>>   perf cs-etm: Split up etm queue setup function
>>   perf cs-etm: Only search timestamp in current sample's queue.
>>   perf cs-etm: Save aux records in each etm queue
>>   perf cs-etm: don't process queues until cs_etm__flush_events
>>   perf cs-etm: split decode by aux records.
>>   perf cs-etm: Use existing decode code path for --dump-raw-trace
>>   perf cs-etm: Suppress printing when resetting decoder
>>
>>  .../perf/util/cs-etm-decoder/cs-etm-decoder.c |  10 +-
>>  tools/perf/util/cs-etm.c  | 300 ++
>>  2 files changed, 168 insertions(+), 142 deletions(-)
>>
>> -- 
>> 2.28.0
>>


[PATCH 7/7] perf cs-etm: Suppress printing when resetting decoder

2021-02-12 Thread James Clark
The decoder is quite noisy when being reset. Now that dump-raw-trace
uses a code path that resets the decoder rather than creating a new
one, printing has to be suppressed to not flood the output.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 3f4bc4050477..e0d530d94e1e 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -32,6 +32,7 @@
 struct cs_etm_decoder {
void *data;
void (*packet_printer)(const char *msg);
+   bool suppress_printing;
dcd_tree_handle_t dcd_tree;
cs_etm_mem_cb_type mem_access;
ocsd_datapath_resp_t prev_return;
@@ -71,9 +72,10 @@ int cs_etm_decoder__reset(struct cs_etm_decoder *decoder)
ocsd_datapath_resp_t dp_ret;
 
decoder->prev_return = OCSD_RESP_CONT;
-
+   decoder->suppress_printing = true;
dp_ret = ocsd_dt_process_data(decoder->dcd_tree, OCSD_OP_RESET,
  0, 0, NULL, NULL);
+   decoder->suppress_printing = false;
if (OCSD_DATA_RESP_IS_FATAL(dp_ret))
return -1;
 
@@ -143,8 +145,10 @@ static void cs_etm_decoder__print_str_cb(const void 
*p_context,
 const char *msg,
 const int str_len)
 {
-   if (p_context && str_len)
-   ((struct cs_etm_decoder *)p_context)->packet_printer(msg);
+   const struct cs_etm_decoder *decoder = p_context;
+
+   if (p_context && str_len && !decoder->suppress_printing)
+   decoder->packet_printer(msg);
 }
 
 static int
-- 
2.28.0



[PATCH 4/7] perf cs-etm: don't process queues until cs_etm__flush_events

2021-02-12 Thread James Clark
To make sure processing happens in the correct order, queue processing
shouldn't start until every aux queue has had its first timestamp found.

Now that we're only searching for timestamps within each aux record, we
need to wait until all aux records are delivered before starting the
processing.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 88b541b2a804..5ab037c2dabe 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -2398,10 +2398,6 @@ static int cs_etm__process_event(struct perf_session 
*session,
else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
return cs_etm__process_switch_cpu_wide(etm, event);
 
-   if (!etm->timeless_decoding &&
-   event->header.type == PERF_RECORD_AUX)
-   return cs_etm__process_queues(etm);
-
return 0;
 }
 
-- 
2.28.0



[PATCH 2/7] perf cs-etm: Only search timestamp in current sample's queue.

2021-02-12 Thread James Clark
Change initial timestamp search to only operate on the queue
related to the current event. In a later change the bounds
of the aux record will also be used to reset the decoder and
the record is only relevant to a single queue.

This change makes some files that had coresight data
but didn't synthesise any events start working and generating
events. I'm not sure of the reason for that. I'd expect this
change to only affect the ordering of events.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 30 ++
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 27894facae5e..8f8b448632fb 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -97,7 +97,7 @@ struct cs_etm_queue {
 /* RB tree for quick conversion between traceID and metadata pointers */
 static struct intlist *traceid_list;
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm);
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
 static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
   pid_t tid);
@@ -524,7 +524,6 @@ static void cs_etm__dump_event(struct cs_etm_auxtrace *etm,
 static int cs_etm__flush_events(struct perf_session *session,
struct perf_tool *tool)
 {
-   int ret;
struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
   struct cs_etm_auxtrace,
   auxtrace);
@@ -534,11 +533,6 @@ static int cs_etm__flush_events(struct perf_session 
*session,
if (!tool->ordered_events)
return -EINVAL;
 
-   ret = cs_etm__update_queues(etm);
-
-   if (ret < 0)
-   return ret;
-
if (etm->timeless_decoding)
return cs_etm__process_timeless_queues(etm, -1);
 
@@ -851,10 +845,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
etmq->queue_nr = queue_nr;
etmq->offset = 0;
 
-   if (etm->timeless_decoding)
-   return 0;
-   else
-   return cs_etm__search_first_timestamp(etmq);
+   return 0;
 }
 
 static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
@@ -874,14 +865,20 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace 
*etm)
return 0;
 }
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm)
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
 {
+   int ret;
if (etm->queues.new_data) {
etm->queues.new_data = false;
-   return cs_etm__setup_queues(etm);
+   ret = cs_etm__setup_queues(etm);
+   if (ret)
+   return ret;
}
 
-   return 0;
+   if (!etm->timeless_decoding)
+   return 
cs_etm__search_first_timestamp(etm->queues.queue_array[cpu].priv);
+   else
+   return 0;
 }
 
 static inline
@@ -2358,8 +2355,9 @@ static int cs_etm__process_event(struct perf_session 
*session,
else
timestamp = 0;
 
-   if (timestamp || etm->timeless_decoding) {
-   err = cs_etm__update_queues(etm);
+   if ((timestamp || etm->timeless_decoding)
+   && event->header.type == PERF_RECORD_AUX) {
+   err = cs_etm__update_queues(etm, sample->cpu);
if (err)
return err;
}
-- 
2.28.0



[PATCH 3/7] perf cs-etm: Save aux records in each etm queue

2021-02-12 Thread James Clark
The aux records will be used to set the bounds of decoding in a
later commit. In the future we may also want to use the flags
of each record to control decoding.

Do these need to be saved in their entirety, or can pointers
to each record safely be saved instead for later access?

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 32 +---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 8f8b448632fb..88b541b2a804 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -92,12 +92,16 @@ struct cs_etm_queue {
/* Conversion between traceID and index in traceid_queues array */
struct intlist *traceid_queues_list;
struct cs_etm_traceid_queue **traceid_queues;
+   int aux_record_list_len;
+   int aux_record_list_idx;
+   struct perf_record_aux *aux_record_list;
 };
 
 /* RB tree for quick conversion between traceID and metadata pointers */
 static struct intlist *traceid_list;
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu,
+struct perf_record_aux *aux_record);
 static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
   pid_t tid);
@@ -585,6 +589,7 @@ static void cs_etm__free_queue(void *priv)
 
cs_etm_decoder__free(etmq->decoder);
cs_etm__free_traceid_queues(etmq);
+   free(etmq->aux_record_list);
free(etmq);
 }
 
@@ -759,6 +764,19 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
cs_etm_auxtrace *etm)
return NULL;
 }
 
+static int cs_etm__save_aux_record(struct cs_etm_queue *etmq,
+  struct perf_record_aux *aux_record)
+{
+   etmq->aux_record_list = reallocarray(etmq->aux_record_list,
+ etmq->aux_record_list_len+1,
+ sizeof(*etmq->aux_record_list));
+   if (!etmq->aux_record_list)
+   return -ENOMEM;
+
+   etmq->aux_record_list[etmq->aux_record_list_len++] = *aux_record;
+   return 0;
+}
+
 static int cs_etm__search_first_timestamp(struct cs_etm_queue *etmq)
 {
int ret = 0;
@@ -865,7 +883,7 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
return 0;
 }
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu, struct 
perf_record_aux *aux)
 {
int ret;
if (etm->queues.new_data) {
@@ -875,6 +893,14 @@ static int cs_etm__update_queues(struct cs_etm_auxtrace 
*etm, int cpu)
return ret;
}
 
+   /* In timeless mode, cpu is set to -1, and a single aux buffer is 
filled */
+   if (cpu < 0)
+   cpu = 0;
+
+   ret = cs_etm__save_aux_record(etm->queues.queue_array[cpu].priv, aux);
+   if (ret)
+   return ret;
+
if (!etm->timeless_decoding)
return 
cs_etm__search_first_timestamp(etm->queues.queue_array[cpu].priv);
else
@@ -2357,7 +2383,7 @@ static int cs_etm__process_event(struct perf_session 
*session,
 
if ((timestamp || etm->timeless_decoding)
&& event->header.type == PERF_RECORD_AUX) {
-   err = cs_etm__update_queues(etm, sample->cpu);
+   err = cs_etm__update_queues(etm, sample->cpu, &event->aux);
if (err)
return err;
}
-- 
2.28.0



[PATCH 6/7] perf cs-etm: Use existing decode code path for --dump-raw-trace

2021-02-12 Thread James Clark
Previously the dump mode created a new decoder for each buffer
and used a decode loop different from the main code path.

This change uses more of the existing code path which tracks
aux records and resets the decoder between each one. Unfortunately
the decoder is quite noisy when being reset, so printing has to
be suppressed around each call to reset.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 91 
 1 file changed, 36 insertions(+), 55 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 3026fcf50b5d..dc4885794859 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -474,56 +474,21 @@ static int cs_etm__init_decoder_params(struct 
cs_etm_decoder_params *d_params,
return ret;
 }
 
-static void cs_etm__dump_event(struct cs_etm_auxtrace *etm,
-  struct auxtrace_buffer *buffer)
+static void cs_etm__dump_event(struct cs_etm_queue *etmq)
 {
-   int ret;
const char *color = PERF_COLOR_BLUE;
-   struct cs_etm_decoder_params d_params;
-   struct cs_etm_trace_params *t_params;
-   struct cs_etm_decoder *decoder;
-   size_t buffer_used = 0;
+
+   if (etmq->aux_record_list_idx >= etmq->aux_record_list_len)
+   return;
 
fprintf(stdout, "\n");
color_fprintf(stdout, color,
 ". ... CoreSight ETM Trace data: size %zu bytes\n",
-buffer->size);
-
-   /* Use metadata to fill in trace parameters for trace decoder */
-   t_params = zalloc(sizeof(*t_params) * etm->num_cpu);
-
-   if (!t_params)
-   return;
-
-   if (cs_etm__init_trace_params(t_params, etm))
-   goto out_free;
-
-   /* Set decoder parameters to simply print the trace packets */
-   if (cs_etm__init_decoder_params(&d_params, NULL,
-   CS_ETM_OPERATION_PRINT))
-   goto out_free;
-
-   decoder = cs_etm_decoder__new(etm->num_cpu, &d_params, t_params);
+etmq->aux_record_list[etmq->aux_record_list_idx].aux_size);
 
-   if (!decoder)
-   goto out_free;
-   do {
-   size_t consumed;
-
-   ret = cs_etm_decoder__process_data_block(
-   decoder, buffer->offset,
-   &((u8 *)buffer->data)[buffer_used],
-   buffer->size - buffer_used, &consumed);
-   if (ret)
+   while (1)
+   if (cs_etm__decode_data_block(etmq) <= 0)
break;
-
-   buffer_used += consumed;
-   } while (buffer_used < buffer->size);
-
-   cs_etm_decoder__free(decoder);
-
-out_free:
-   zfree(&t_params);
 }
 
 static int cs_etm__flush_events(struct perf_session *session,
@@ -735,9 +700,15 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
cs_etm_auxtrace *etm)
goto out_free;
 
/* Set decoder parameters to decode trace packets */
-   if (cs_etm__init_decoder_params(&d_params, etmq,
-   CS_ETM_OPERATION_DECODE))
-   goto out_free;
+   if (dump_trace) {
+   if (cs_etm__init_decoder_params(&d_params, NULL,
+   CS_ETM_OPERATION_PRINT))
+   goto out_free;
+   } else {
+   if (cs_etm__init_decoder_params(&d_params, etmq,
+   CS_ETM_OPERATION_DECODE))
+   goto out_free;
+   }
 
etmq->decoder = cs_etm_decoder__new(etm->num_cpu, &d_params, t_params);
 
@@ -903,7 +874,7 @@ static int cs_etm__update_queues(struct cs_etm_auxtrace 
*etm, int cpu, struct pe
if (ret)
return ret;
 
-   if (!etm->timeless_decoding)
+   if (!etm->timeless_decoding && !dump_trace)
return 
cs_etm__search_first_timestamp(etm->queues.queue_array[cpu].priv);
else
return 0;
@@ -1647,6 +1618,9 @@ static int cs_etm__get_data_block(struct cs_etm_queue 
*etmq)
 {
int ret;
 
+   if (etmq->aux_record_list_idx >= etmq->aux_record_list_len)
+   return 0;
+
if (etmq->aux_record_list[etmq->aux_record_list_idx].aux_size <= 0) {
etmq->aux_record_list_idx++;
ret = cs_etm_decoder__reset(etmq->decoder);
@@ -2387,10 +2361,7 @@ static int cs_etm__process_event(struct perf_session 
*session,
   struct cs_etm_auxtrace,
   auxtrace);
 
-   if (dump_trace)
-   return 0;
-
-   if (!tool->ordered_events) {
+   if (!tool->ordered_events && !dump_trace) {
  

[PATCH 5/7] perf cs-etm: split decode by aux records.

2021-02-12 Thread James Clark
The trace data between aux records is not continuous, so the decoder
must be reset between each record to ensure that parsing happens
correctly and without any early exits.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 109 +++
 1 file changed, 64 insertions(+), 45 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 5ab037c2dabe..3026fcf50b5d 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -95,6 +95,7 @@ struct cs_etm_queue {
int aux_record_list_len;
int aux_record_list_idx;
struct perf_record_aux *aux_record_list;
+   bool timestamp_found;
 };
 
 /* RB tree for quick conversion between traceID and metadata pointers */
@@ -784,6 +785,9 @@ static int cs_etm__search_first_timestamp(struct 
cs_etm_queue *etmq)
unsigned int cs_queue_nr;
u8 trace_chan_id;
 
+   if (etmq->timestamp_found)
+   return 0;
+
/*
 * We are under a CPU-wide trace scenario.  As such we need to know
 * when the code that generated the traces started to execute so that
@@ -792,56 +796,54 @@ static int cs_etm__search_first_timestamp(struct 
cs_etm_queue *etmq)
 * timestamp.  The timestamp is then added to the auxtrace min heap
 * in order to know what nibble (of all the etmqs) to decode first.
 */
-   while (1) {
-   /*
-* Fetch an aux_buffer from this etmq.  Bail if no more
-* blocks or an error has been encountered.
-*/
-   ret = cs_etm__get_data_block(etmq);
-   if (ret <= 0)
-   return ret;
-
-   /*
-* Run decoder on the trace block.  The decoder will stop when
-* encountering a timestamp, a full packet queue or the end of
-* trace for that block.
-*/
-   ret = cs_etm__decode_data_block(etmq);
-   if (ret)
-   return ret;
+   /*
+* Fetch an aux_buffer from this etmq.  Bail if no more
+* blocks or an error has been encountered.
+*/
+   ret = cs_etm__get_data_block(etmq);
+   if (ret <= 0)
+   return ret;
 
-   /*
-* Function cs_etm_decoder__do_{hard|soft}_timestamp() does all
-* the timestamp calculation for us.
-*/
-   timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id);
+   /*
+* Run decoder on the trace block.  The decoder will stop when
+* encountering a timestamp, a full packet queue or the end of
+* trace for that block.
+*/
+   ret = cs_etm__decode_data_block(etmq);
+   if (ret)
+   return ret;
 
-   /* We found a timestamp, no need to continue. */
-   if (timestamp)
-   break;
+   /*
+* Function cs_etm_decoder__do_{hard|soft}_timestamp() does all
+* the timestamp calculation for us.
+*/
+   timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id);
 
+   /* We found a timestamp, no need to continue. */
+   if (timestamp) {
/*
-* We didn't find a timestamp so empty all the traceid packet
-* queues before looking for another timestamp packet, either
-* in the current data block or a new one.  Packets that were
-* just decoded are useless since no timestamp has been
-* associated with them.  As such simply discard them.
+* We have a timestamp.  Add it to the min heap to reflect when
+* instructions conveyed by the range packets of this traceID 
queue
+* started to execute.  Once the same has been done for all the 
traceID
+* queues of each etmq, redenring and decoding can start in
+* chronological order.
+*
+* Note that packets decoded above are still in the traceID's 
packet
+* queue and will be processed in cs_etm__process_queues().
 */
-   cs_etm__clear_all_packet_queues(etmq);
+   etmq->timestamp_found = true;
+   cs_queue_nr = TO_CS_QUEUE_NR(etmq->queue_nr, trace_chan_id);
+   return auxtrace_heap__add(&etmq->etm->heap, cs_queue_nr, 
timestamp);
}
-
/*
-* We have a timestamp.  Add it to the min heap to reflect when
-* instructions conveyed by the range packets of this traceID queue
-* started to execute.  Once the same has been done for all the traceID
-* queues of each etmq, redenring and decoding can start in
-* chronological order.
-*
-* Note that packets decoded above are still in the traceID's packet
-* queue

[PATCH 0/7] Split Coresight decode by aux records

2021-02-12 Thread James Clark
Hi All,

Since my previous RFC, I've fixed --per-thread mode and solved
most of the open questions. I've also changed --dump-raw-trace
to use the same code path so it's also working now.

I think the only open questions are:
  * General approach
  * If aux records need to be saved, or if they can be pulled
from elsewhere.

I've also tested perf inject which is now working with troublesome
files.

Thanks
James

James Clark (7):
  perf cs-etm: Split up etm queue setup function
  perf cs-etm: Only search timestamp in current sample's queue.
  perf cs-etm: Save aux records in each etm queue
  perf cs-etm: don't process queues until cs_etm__flush_events
  perf cs-etm: split decode by aux records.
  perf cs-etm: Use existing decode code path for --dump-raw-trace
  perf cs-etm: Suppress printing when resetting decoder

 .../perf/util/cs-etm-decoder/cs-etm-decoder.c |  10 +-
 tools/perf/util/cs-etm.c  | 300 ++
 2 files changed, 168 insertions(+), 142 deletions(-)

-- 
2.28.0



[PATCH 1/7] perf cs-etm: Split up etm queue setup function

2021-02-12 Thread James Clark
Refactor the function into separate allocation and
timestamp search parts. Later the timestamp search
will be done multiple times.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 60 +---
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a2a369e2fbb6..27894facae5e 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -765,33 +765,12 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
cs_etm_auxtrace *etm)
return NULL;
 }
 
-static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
-  struct auxtrace_queue *queue,
-  unsigned int queue_nr)
+static int cs_etm__search_first_timestamp(struct cs_etm_queue *etmq)
 {
int ret = 0;
+   u64 timestamp;
unsigned int cs_queue_nr;
u8 trace_chan_id;
-   u64 timestamp;
-   struct cs_etm_queue *etmq = queue->priv;
-
-   if (list_empty(&queue->head) || etmq)
-   goto out;
-
-   etmq = cs_etm__alloc_queue(etm);
-
-   if (!etmq) {
-   ret = -ENOMEM;
-   goto out;
-   }
-
-   queue->priv = etmq;
-   etmq->etm = etm;
-   etmq->queue_nr = queue_nr;
-   etmq->offset = 0;
-
-   if (etm->timeless_decoding)
-   goto out;
 
/*
 * We are under a CPU-wide trace scenario.  As such we need to know
@@ -808,7 +787,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
 */
ret = cs_etm__get_data_block(etmq);
if (ret <= 0)
-   goto out;
+   return ret;
 
/*
 * Run decoder on the trace block.  The decoder will stop when
@@ -817,7 +796,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
 */
ret = cs_etm__decode_data_block(etmq);
if (ret)
-   goto out;
+   return ret;
 
/*
 * Function cs_etm_decoder__do_{hard|soft}_timestamp() does all
@@ -849,10 +828,33 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace 
*etm,
 * Note that packets decoded above are still in the traceID's packet
 * queue and will be processed in cs_etm__process_queues().
 */
-   cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id);
-   ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, timestamp);
-out:
-   return ret;
+   cs_queue_nr = TO_CS_QUEUE_NR(etmq->queue_nr, trace_chan_id);
+   return auxtrace_heap__add(&etmq->etm->heap, cs_queue_nr, timestamp);
+}
+
+static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
+  struct auxtrace_queue *queue,
+  unsigned int queue_nr)
+{
+   struct cs_etm_queue *etmq = queue->priv;
+
+   if (list_empty(&queue->head) || etmq)
+   return 0;
+
+   etmq = cs_etm__alloc_queue(etm);
+
+   if (!etmq)
+   return -ENOMEM;
+
+   queue->priv = etmq;
+   etmq->etm = etm;
+   etmq->queue_nr = queue_nr;
+   etmq->offset = 0;
+
+   if (etm->timeless_decoding)
+   return 0;
+   else
+   return cs_etm__search_first_timestamp(etmq);
 }
 
 static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
-- 
2.28.0



Re: [PATCH 1/8] perf arm-spe: Enable sample type PERF_SAMPLE_DATA_SRC

2021-02-11 Thread James Clark



On 22/01/2021 14:51, Arnaldo Carvalho de Melo wrote:
> Em Tue, Jan 19, 2021 at 04:46:51PM +0200, James Clark escreveu:
>> From: Leo Yan 
>>
>> This patch is to enable sample type PERF_SAMPLE_DATA_SRC for Arm SPE in
>> the perf data, when output the tracing data, it tells tools that it
>> contains data source in the memory event.
>>
>> Signed-off-by: Leo Yan 
>> Signed-off-by: James Clark 
> 
> I see two Signed-off-by, ok, any Reviewed-by?
> 
> - Arnaldo

Hi Arnaldo,

I have submitted v2 and added my reviewed-by and tested-by.

I didn't change any of the authors as Leo suggested because I only
modified the last two patches which we dropped anyway to not show
any misleading PID data when run from a container.


Thanks
James

> 
>> Cc: Peter Zijlstra 
>> Cc: Ingo Molnar 
>> Cc: Arnaldo Carvalho de Melo 
>> Cc: Mark Rutland 
>> Cc: Alexander Shishkin 
>> Cc: Jiri Olsa 
>> Cc: Namhyung Kim 
>> Cc: John Garry 
>> Cc: Will Deacon 
>> Cc: Mathieu Poirier 
>> Cc: Al Grant 
>> Cc: Andre Przywara 
>> Cc: Wei Li 
>> Cc: Tan Xiaojun 
>> Cc: Adrian Hunter 
>> ---
>>  tools/perf/util/arm-spe.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
>> index 8901a1656a41..b134516e890b 100644
>> --- a/tools/perf/util/arm-spe.c
>> +++ b/tools/perf/util/arm-spe.c
>> @@ -803,7 +803,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct 
>> perf_session *session)
>>  attr.type = PERF_TYPE_HARDWARE;
>>  attr.sample_type = evsel->core.attr.sample_type & PERF_SAMPLE_MASK;
>>  attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
>> -PERF_SAMPLE_PERIOD;
>> +PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC;
>>  if (spe->timeless_decoding)
>>  attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
>>  else
>> -- 
>> 2.28.0
>>
> 


[PATCH v2 6/6] perf arm-spe: Set sample's data source field

2021-02-11 Thread James Clark
From: Leo Yan 

The sample structure contains the field 'data_src' which is used to
tell the data operation attributions, e.g. operation type is loading or
storing, cache level, it's snooping or remote accessing, etc.  At the
end, the 'data_src' will be parsed by perf mem/c2c tools to display
human readable strings.

This patch fills the 'data_src' field in the synthesized samples
based on the different types.  Currently the perf tool can display
statistics for L1/L2/L3 caches but it doesn't support the 'last level
cache'.  To fit the current implementation, the 'data_src' field uses
the L3 cache for the last level cache.

Before this commit, perf mem report looks like this:
# Samples: 75K of event 'l1d-miss'
# Total weight : 75951
# Sort order   : 
local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked
#
# Overhead   Samples  Local Weight  Memory access Symbol
  Shared Object Data Symbol Data Object   Snoop 
TLB access
#         
..    ..  
    ...
#
81.56% 61945  0 N/A   [.] 
0x09d8  serial_c  [.] [unknown] 
N/A   N/A
18.44% 14003  0 N/A   [.] 
0x0828  serial_c  [.] [unknown] 
N/A   N/A

Now on a system with Arm SPE, addresses and access types are displayed:

# Samples: 75K of event 'l1d-miss'
# Total weight : 75951
# Sort order   : 
local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked
#
# Overhead   Samples  Local Weight  Memory access Symbol
  Shared Object Data Symbol Data Object  Snoop  
   TLB access
#         
..    ..  ...  
  ..
#
 0.43%   324  0 L1 miss   [.] 
0x09d8  serial_c  [.] 0x80794e00  anon N/A  
 Walker hit
 0.42%   322  0 L1 miss   [.] 
0x09d8  serial_c  [.] 0x80794580  anon     N/A  
     Walker hit

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Reviewed-by: James Clark 
Tested-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 69 ++-
 1 file changed, 60 insertions(+), 9 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 5550906486d8..27a0b9dfe22d 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -261,7 +261,7 @@ arm_spe_deliver_synth_event(struct arm_spe *spe,
 }
 
 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
-u64 spe_events_id)
+u64 spe_events_id, u64 data_src)
 {
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
@@ -274,6 +274,7 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue 
*speq,
sample.stream_id = spe_events_id;
sample.addr = record->virt_addr;
sample.phys_addr = record->phys_addr;
+   sample.data_src = data_src;
 
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
@@ -307,21 +308,66 @@ static bool arm_spe__is_memory_event(enum 
arm_spe_sample_type type)
return false;
 }
 
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+{
+   union perf_mem_data_src data_src = { 0 };
+
+   if (record->op == ARM_SPE_LD)
+   data_src.mem_op = PERF_MEM_OP_LOAD;
+   else
+   data_src.mem_op = PERF_MEM_OP_STORE;
+
+   if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
+   data_src.mem_lvl = PERF_MEM_LVL_L3;
+
+   if (record->type & ARM_SPE_LLC_MISS)
+   data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+   else
+   data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+   } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
+   data_src.mem_lvl = PERF_MEM_LVL_L1;
+
+   if (record->type & ARM_SPE_L1D_MISS)
+   data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+

[PATCH v2 5/6] perf arm-spe: Synthesize memory event

2021-02-11 Thread James Clark
From: Leo Yan 

The memory event can deliver two benefits:

- The first benefit is that the memory event gives a global view of
  memory accesses, rather than organizing events in a scattered mode
  (e.g. using separate events for the L1 cache, last level cache, etc.)
  which can only display an event for a single memory type.  Memory
  events include all memory accesses, so data accesses can be displayed
  across memory levels in the same view;

- The second benefit is that sample generation might introduce a big
  overhead and require a long wait for perf reporting; we can
  specify the itrace option '--itrace=M' to filter out other events and
  only output memory events, which can significantly reduce the overhead
  caused by generating samples.

This patch is to enable memory event for Arm SPE.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Reviewed-by: James Clark 
Tested-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 578725344603..5550906486d8 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -53,6 +53,7 @@ struct arm_spe {
u8  sample_tlb;
u8  sample_branch;
u8  sample_remote_access;
+   u8  sample_memory;
 
u64 l1d_miss_id;
u64 l1d_access_id;
@@ -62,6 +63,7 @@ struct arm_spe {
u64 tlb_access_id;
u64 branch_miss_id;
u64 remote_access_id;
+   u64 memory_id;
 
u64 kernel_start;
 
@@ -293,6 +295,18 @@ static int arm_spe__synth_branch_sample(struct 
arm_spe_queue *speq,
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
 
+#define SPE_MEM_TYPE   (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS | \
+ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS | \
+ARM_SPE_REMOTE_ACCESS)
+
+static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
+{
+   if (type & SPE_MEM_TYPE)
+   return true;
+
+   return false;
+}
+
 static int arm_spe_sample(struct arm_spe_queue *speq)
 {
const struct arm_spe_record *record = &speq->decoder->record;
@@ -354,6 +368,12 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
return err;
}
 
+   if (spe->sample_memory && arm_spe__is_memory_event(record->type)) {
+   err = arm_spe__synth_mem_sample(speq, spe->memory_id);
+   if (err)
+   return err;
+   }
+
return 0;
 }
 
@@ -917,6 +937,16 @@ arm_spe_synth_events(struct arm_spe *spe, struct 
perf_session *session)
id += 1;
}
 
+   if (spe->synth_opts.mem) {
+   spe->sample_memory = true;
+
+   err = arm_spe_synth_event(session, &attr, id);
+   if (err)
+   return err;
+   spe->memory_id = id;
+   arm_spe_set_event_name(evlist, id, "memory");
+   }
+
return 0;
 }
 
-- 
2.28.0



[PATCH v2 4/6] perf arm-spe: Fill address info for samples

2021-02-11 Thread James Clark
From: Leo Yan 

To properly handle memory and branch samples, this patch splits sample
generation into two functions: arm_spe__synth_mem_sample() synthesizes
memory and TLB samples; arm_spe__synth_branch_sample() synthesizes
branch samples.

Arm SPE backend decoder has passed virtual and physical address through
packets, the address info is stored into the synthesize samples in the
function arm_spe__synth_mem_sample().

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Reviewed-by: James Clark 
Tested-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 52 +++
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index b134516e890b..578725344603 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -235,7 +235,6 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
sample->cpumode = arm_spe_cpumode(spe, sample->ip);
sample->pid = speq->pid;
sample->tid = speq->tid;
-   sample->addr = record->to_ip;
sample->period = 1;
sample->cpu = speq->cpu;
 
@@ -259,18 +258,37 @@ arm_spe_deliver_synth_event(struct arm_spe *spe,
return ret;
 }
 
-static int
-arm_spe_synth_spe_events_sample(struct arm_spe_queue *speq,
-   u64 spe_events_id)
+static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
+u64 spe_events_id)
 {
struct arm_spe *spe = speq->spe;
+   struct arm_spe_record *record = &speq->decoder->record;
+   union perf_event *event = speq->event_buf;
+   struct perf_sample sample = { 0 };
+
+   arm_spe_prep_sample(spe, speq, event, &sample);
+
+   sample.id = spe_events_id;
+   sample.stream_id = spe_events_id;
+   sample.addr = record->virt_addr;
+   sample.phys_addr = record->phys_addr;
+
+   return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+}
+
+static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
+   u64 spe_events_id)
+{
+   struct arm_spe *spe = speq->spe;
+   struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
-   struct perf_sample sample = { .ip = 0, };
+   struct perf_sample sample = { 0 };
 
arm_spe_prep_sample(spe, speq, event, &sample);
 
sample.id = spe_events_id;
sample.stream_id = spe_events_id;
+   sample.addr = record->to_ip;
 
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
@@ -283,15 +301,13 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 
if (spe->sample_flc) {
if (record->type & ARM_SPE_L1D_MISS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->l1d_miss_id);
+   err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id);
if (err)
return err;
}
 
if (record->type & ARM_SPE_L1D_ACCESS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->l1d_access_id);
+   err = arm_spe__synth_mem_sample(speq, 
spe->l1d_access_id);
if (err)
return err;
}
@@ -299,15 +315,13 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 
if (spe->sample_llc) {
if (record->type & ARM_SPE_LLC_MISS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->llc_miss_id);
+   err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id);
if (err)
return err;
}
 
if (record->type & ARM_SPE_LLC_ACCESS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->llc_access_id);
+   err = arm_spe__synth_mem_sample(speq, 
spe->llc_access_id);
if (err)
return err;
}
@@ -315,31 +329,27 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 
if (spe->sample_tlb) {
if (record->type & ARM_SPE_TLB_MISS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->tlb_m

[PATCH v2 3/6] perf arm-spe: Store operation type in packet

2021-02-11 Thread James Clark
From: Leo Yan 

This patch is to store operation type in packet structure.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Reviewed-by: James Clark 
Tested-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 6 ++
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 7aac3048b090..32fe41835fa6 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -182,6 +182,12 @@ static int arm_spe_read_record(struct arm_spe_decoder 
*decoder)
case ARM_SPE_CONTEXT:
break;
case ARM_SPE_OP_TYPE:
+   if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) {
+   if (payload & 0x1)
+   decoder->record.op = ARM_SPE_ST;
+   else
+   decoder->record.op = ARM_SPE_LD;
+   }
break;
case ARM_SPE_EVENTS:
if (payload & BIT(EV_L1D_REFILL))
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 7b845001afe7..59bdb7309674 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -24,9 +24,15 @@ enum arm_spe_sample_type {
ARM_SPE_REMOTE_ACCESS   = 1 << 7,
 };
 
+enum arm_spe_op_type {
+   ARM_SPE_LD  = 1 << 0,
+   ARM_SPE_ST  = 1 << 1,
+};
+
 struct arm_spe_record {
enum arm_spe_sample_type type;
int err;
+   u32 op;
u64 from_ip;
u64 to_ip;
u64 timestamp;
-- 
2.28.0



[PATCH v2 1/6] perf arm-spe: Enable sample type PERF_SAMPLE_DATA_SRC

2021-02-11 Thread James Clark
From: Leo Yan 

This patch is to enable sample type PERF_SAMPLE_DATA_SRC for Arm SPE in
the perf data, when output the tracing data, it tells tools that it
contains data source in the memory event.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Reviewed-by: James Clark 
Tested-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 8901a1656a41..b134516e890b 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -803,7 +803,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct 
perf_session *session)
attr.type = PERF_TYPE_HARDWARE;
attr.sample_type = evsel->core.attr.sample_type & PERF_SAMPLE_MASK;
attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
-   PERF_SAMPLE_PERIOD;
+   PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC;
if (spe->timeless_decoding)
attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
else
-- 
2.28.0



[PATCH v2 2/6] perf arm-spe: Store memory address in packet

2021-02-11 Thread James Clark
From: Leo Yan 

This patch is to store virtual and physical memory addresses in packet,
which will be used for memory samples.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Reviewed-by: James Clark 
Tested-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 4 
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 90d575cee1b9..7aac3048b090 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -172,6 +172,10 @@ static int arm_spe_read_record(struct arm_spe_decoder 
*decoder)
decoder->record.from_ip = ip;
else if (idx == SPE_ADDR_PKT_HDR_INDEX_BRANCH)
decoder->record.to_ip = ip;
+   else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT)
+   decoder->record.virt_addr = ip;
+   else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS)
+   decoder->record.phys_addr = ip;
break;
case ARM_SPE_COUNTER:
break;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 24727b8ca7ff..7b845001afe7 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -30,6 +30,8 @@ struct arm_spe_record {
u64 from_ip;
u64 to_ip;
u64 timestamp;
+   u64 virt_addr;
+   u64 phys_addr;
 };
 
 struct arm_spe_insn;
-- 
2.28.0



Re: [PATCH 8/8] perf arm-spe: Set thread TID

2021-02-10 Thread James Clark



On 09/02/2021 17:36, James Clark wrote:
> 
> 
> On 04/02/2021 12:27, Leo Yan wrote:
>> On Mon, Feb 01, 2021 at 07:40:45PM +0200, James Clark wrote:
>>>
>>> On 31/01/2021 14:01, Leo Yan wrote:
>>>> Option 1: by merging patches 07/08 and 08/08, we can firstly support PID
>>>> tracing for root namespace, and later we can extend to support PID
>>>> tracing in container (and in VMs).
>>>>
>> Arm SPE has the problem for step2, due to the trace uses statistical
>> approach, it doesn't trace the complete branch instructions, so it
>> cannot promise to capture all branches for the symbol "__switch_to".
>> If we only use the events PERF_RECORD_SWITCH /
>> PERF_RECORD_SWITCH_CPU_WIDE, then it will lead to the coarse result
>> for PID tracing.
>>
>> For this reason, seems to me it's pragmatic to use CONTEXTIDR for
>> PID tracing at current stage, at least it can allow the root domain
>> tracing works accurately.  But this will leave the issue for tracing
>> PID in non root namespace, we need to figure out solution later.
>>
>> Hi Mark.R, Al, do you have any comments for this?
> 
> Hi Leo,
> 
> I spoke with Al and his suggestion is to clear the PID value if the event
> was opened outside of the root namespace.
> 
> I think that's not a bad idea as it gets us PIDs in most cases but also
> doesn't show any incorrect data. Do you know if it's possible to determine
> that from a perf.data file? Unfortunately it doesn't seem to be possible
> to disable CONTEXTIDR tracing when opening the event as it's compile time
> only and can't be disabled dynamically.
> 
> James
> 

I've had a think about it and I think we should do one of two things:

#1) Remove the PID setting from the data source patchset. This will keep the
existing behaviour of using the PID of the first traced process only even
if there are forks. Later we can implement #2 or attempt to make it work
even in non root namespaces.

I'm not sure how this will impact your c2c patchset if you are relying on
the PID data Leo?

#2) Make a change in the SPE driver to add an option for disabling CONTEXTIDR.
We will disable this from userspace if the event is opened in a non root
namespace. So we will only show PID data if we know it's valid, otherwise
    the existing behaviour of only using the first PID will remain.

Hopefully those solutions will help to minimise changes in behaviour between
kernel releases that could be confusing.


>>
>> Thanks,
>> Leo
>>
>>>>> Signed-off-by: Leo Yan 
>>>>> Signed-off-by: James Clark 
>>>>
>>>> Besides for techinical question, you could add your "Co-developed-by"
>>>> tags for patches 06, 07, 08/08, which you have took time to refin them.
>>>>
>>>> Thanks you for kindly efforts.
>>>>
>>>> [1] https://lore.kernel.org/patchwork/patch/1353286/
>>>>
>>>>> Cc: Peter Zijlstra 
>>>>> Cc: Ingo Molnar 
>>>>> Cc: Arnaldo Carvalho de Melo 
>>>>> Cc: Mark Rutland 
>>>>> Cc: Alexander Shishkin 
>>>>> Cc: Jiri Olsa 
>>>>> Cc: Namhyung Kim 
>>>>> Cc: John Garry 
>>>>> Cc: Will Deacon 
>>>>> Cc: Mathieu Poirier 
>>>>> Cc: Al Grant 
>>>>> Cc: Andre Przywara 
>>>>> Cc: Wei Li 
>>>>> Cc: Tan Xiaojun 
>>>>> Cc: Adrian Hunter 
>>>>> ---
>>>>>  tools/perf/util/arm-spe.c | 75 ++-
>>>>>  1 file changed, 50 insertions(+), 25 deletions(-)
>>>>>
>>>>> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
>>>>> index 27a0b9dfe22d..9828fad7e516 100644
>>>>> --- a/tools/perf/util/arm-spe.c
>>>>> +++ b/tools/perf/util/arm-spe.c
>>>>> @@ -223,6 +223,46 @@ static inline u8 arm_spe_cpumode(struct arm_spe 
>>>>> *spe, u64 ip)
>>>>>   PERF_RECORD_MISC_USER;
>>>>>  }
>>>>>  
>>>>> +static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
>>>>> + struct auxtrace_queue *queue)
>>>>> +{
>>>>> + struct arm_spe_queue *speq = queue->priv;
>>>>> + pid_t tid;
>>>>> +
>>>>> + tid = machine__get_current_tid(spe->machine, speq->cpu);
>>>>> + if (tid != -1) {
>>>>> + speq->tid = t

Re: [PATCH 8/8] perf arm-spe: Set thread TID

2021-02-09 Thread James Clark



On 04/02/2021 12:27, Leo Yan wrote:
> On Mon, Feb 01, 2021 at 07:40:45PM +0200, James Clark wrote:
>>
>> On 31/01/2021 14:01, Leo Yan wrote:
>>> Option 1: by merging patches 07/08 and 08/08, we can firstly support PID
>>> tracing for root namespace, and later we can extend to support PID
>>> tracing in container (and in VMs).
>>>
> Arm SPE has the problem for step2, due to the trace uses statistical
> approach, it doesn't trace the complete branch instructions, so it
> cannot promise to capture all branches for the symbol "__switch_to".
> If we only use the events PERF_RECORD_SWITCH /
> PERF_RECORD_SWITCH_CPU_WIDE, then it will lead to the coarse result
> for PID tracing.
> 
> For this reason, seems to me it's pragmatic to use CONTEXTIDR for
> PID tracing at current stage, at least it can allow the root domain
> tracing works accurately.  But this will leave the issue for tracing
> PID in non root namespace, we need to figure out solution later.
> 
> Hi Mark.R, Al, do you have any comments for this?

Hi Leo,

I spoke with Al and his suggestion is to clear the PID value if the event
was opened outside of the root namespace.

I think that's not a bad idea as it gets us PIDs in most cases but also
doesn't show any incorrect data. Do you know if it's possible to determine
that from a perf.data file? Unfortunately it doesn't seem to be possible
to disable CONTEXTIDR tracing when opening the event as it's compile time
only and can't be disabled dynamically.

James

> 
> Thanks,
> Leo
> 
>>>> Signed-off-by: Leo Yan 
>>>> Signed-off-by: James Clark 
>>>
>>> Besides for techinical question, you could add your "Co-developed-by"
>>> tags for patches 06, 07, 08/08, which you have took time to refin them.
>>>
>>> Thanks you for kindly efforts.
>>>
>>> [1] https://lore.kernel.org/patchwork/patch/1353286/
>>>
>>>> Cc: Peter Zijlstra 
>>>> Cc: Ingo Molnar 
>>>> Cc: Arnaldo Carvalho de Melo 
>>>> Cc: Mark Rutland 
>>>> Cc: Alexander Shishkin 
>>>> Cc: Jiri Olsa 
>>>> Cc: Namhyung Kim 
>>>> Cc: John Garry 
>>>> Cc: Will Deacon 
>>>> Cc: Mathieu Poirier 
>>>> Cc: Al Grant 
>>>> Cc: Andre Przywara 
>>>> Cc: Wei Li 
>>>> Cc: Tan Xiaojun 
>>>> Cc: Adrian Hunter 
>>>> ---
>>>>  tools/perf/util/arm-spe.c | 75 ++-
>>>>  1 file changed, 50 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
>>>> index 27a0b9dfe22d..9828fad7e516 100644
>>>> --- a/tools/perf/util/arm-spe.c
>>>> +++ b/tools/perf/util/arm-spe.c
>>>> @@ -223,6 +223,46 @@ static inline u8 arm_spe_cpumode(struct arm_spe *spe, 
>>>> u64 ip)
>>>>PERF_RECORD_MISC_USER;
>>>>  }
>>>>  
>>>> +static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
>>>> +  struct auxtrace_queue *queue)
>>>> +{
>>>> +  struct arm_spe_queue *speq = queue->priv;
>>>> +  pid_t tid;
>>>> +
>>>> +  tid = machine__get_current_tid(spe->machine, speq->cpu);
>>>> +  if (tid != -1) {
>>>> +  speq->tid = tid;
>>>> +  thread__zput(speq->thread);
>>>> +  } else
>>>> +  speq->tid = queue->tid;
>>>> +
>>>> +  if ((!speq->thread) && (speq->tid != -1)) {
>>>> +  speq->thread = machine__find_thread(spe->machine, -1,
>>>> +  speq->tid);
>>>> +  }
>>>> +
>>>> +  if (speq->thread) {
>>>> +  speq->pid = speq->thread->pid_;
>>>> +  if (queue->cpu == -1)
>>>> +  speq->cpu = speq->thread->cpu;
>>>> +  }
>>>> +}
>>>> +
>>>> +static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
>>>> +{
>>>> +  int err;
>>>> +  struct arm_spe *spe = speq->spe;
>>>> +  struct auxtrace_queue *queue;
>>>> +
>>>> +  err = machine__set_current_tid(spe->machine, speq->cpu, tid, tid);
>>>> +  if (err)
>>>> +  return err;
>>>> +
>>>> +  queue = &speq->spe->queues.queue_array[speq->queue_

[RFC PATCH 5/5] perf cs-etm: split decode by aux records.

2021-02-09 Thread James Clark
The trace data between aux records is not continuous, so the decoder
must be reset between each record to ensure that parsing happens
correctly and without any early exits.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 108 ---
 1 file changed, 66 insertions(+), 42 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 0aaa1f6d2822..b0f464a50e2f 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -95,6 +95,7 @@ struct cs_etm_queue {
int aux_record_list_len;
int aux_record_list_idx;
struct perf_record_aux *aux_record_list;
+   bool timestamp_found;
 };
 
 /* RB tree for quick conversion between traceID and metadata pointers */
@@ -788,6 +789,9 @@ static int cs_etm__seach_first_timestamp(struct 
cs_etm_queue *etmq,
 
etmq->aux_record_list[etmq->aux_record_list_len++] = *aux_record;
 
+   if (etmq->timestamp_found)
+   return 0;
+
/*
 * We are under a CPU-wide trace scenario.  As such we need to know
 * when the code that generated the traces started to execute so that
@@ -796,56 +800,60 @@ static int cs_etm__seach_first_timestamp(struct 
cs_etm_queue *etmq,
 * timestamp.  The timestamp is then added to the auxtrace min heap
 * in order to know what nibble (of all the etmqs) to decode first.
 */
-   while (1) {
-   /*
-* Fetch an aux_buffer from this etmq.  Bail if no more
-* blocks or an error has been encountered.
-*/
-   ret = cs_etm__get_data_block(etmq);
-   if (ret <= 0)
-   return ret;
-
-   /*
-* Run decoder on the trace block.  The decoder will stop when
-* encountering a timestamp, a full packet queue or the end of
-* trace for that block.
-*/
-   ret = cs_etm__decode_data_block(etmq);
+   /*
+* Fetch an aux_buffer from this etmq.  Bail if no more
+* blocks or an error has been encountered.
+*/
+   if (etmq->aux_record_list[etmq->aux_record_list_idx].aux_size <= 0) {
+   etmq->aux_record_list_idx++;
+   ret = cs_etm_decoder__reset(etmq->decoder);
if (ret)
return ret;
+   }
+   ret = cs_etm__get_data_block(etmq);
+   if (ret <= 0)
+   return ret;
 
-   /*
-* Function cs_etm_decoder__do_{hard|soft}_timestamp() does all
-* the timestamp calculation for us.
-*/
-   timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id);
+   /*
+* Run decoder on the trace block.  The decoder will stop when
+* encountering a timestamp, a full packet queue or the end of
+* trace for that block.
+*/
+   ret = cs_etm__decode_data_block(etmq);
+   if (ret)
+   return ret;
 
-   /* We found a timestamp, no need to continue. */
-   if (timestamp)
-   break;
+   /*
+* Function cs_etm_decoder__do_{hard|soft}_timestamp() does all
+* the timestamp calculation for us.
+*/
+   timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id);
 
+   /* We found a timestamp, no need to continue. */
+   if (timestamp) {
/*
-* We didn't find a timestamp so empty all the traceid packet
-* queues before looking for another timestamp packet, either
-* in the current data block or a new one.  Packets that were
-* just decoded are useless since no timestamp has been
-* associated with them.  As such simply discard them.
+* We have a timestamp.  Add it to the min heap to reflect when
+* instructions conveyed by the range packets of this traceID 
queue
+* started to execute.  Once the same has been done for all the 
traceID
+* queues of each etmq, redenring and decoding can start in
+* chronological order.
+*
+* Note that packets decoded above are still in the traceID's 
packet
+* queue and will be processed in cs_etm__process_queues().
 */
-   cs_etm__clear_all_packet_queues(etmq);
+   etmq->timestamp_found = true;
+   cs_queue_nr = TO_CS_QUEUE_NR(etmq->queue_nr, trace_chan_id);
+   return auxtrace_heap__add(&etmq->etm->heap, cs_queue_nr, 
timestamp);
}
-
/*
-* We have a timestamp.  Add it to the min heap to reflect when
-* instructions conveyed by the range packets of this traceID queue
-* started to execute.  Once the same has been done f

[RFC PATCH 2/5] perf cs-etm: Only search timestamp in current sample's queue.

2021-02-09 Thread James Clark
Change initial timestamp search to only operate on the queue
related to the current event. In a later change the bounds
of the aux record will also be used to reset the decoder and
the record is only relevant to a single queue.

This doesn't work in --per-thread mode where cpu == -1, so
a further change will be required.

Also this change makes some files that had coresight data
but didn't synthesise any events start working and generating
events. I'm not sure of the reason for that. I'd expect this
change to only affect the ordering of events.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 34 --
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 440001cdd3b8..9ebe43d60d1e 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -97,7 +97,7 @@ struct cs_etm_queue {
 /* RB tree for quick conversion between traceID and metadata pointers */
 static struct intlist *traceid_list;
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm);
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
 static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
   pid_t tid);
@@ -524,7 +524,7 @@ static void cs_etm__dump_event(struct cs_etm_auxtrace *etm,
 static int cs_etm__flush_events(struct perf_session *session,
struct perf_tool *tool)
 {
-   int ret;
+   //int ret;
struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
   struct cs_etm_auxtrace,
   auxtrace);
@@ -534,10 +534,12 @@ static int cs_etm__flush_events(struct perf_session 
*session,
if (!tool->ordered_events)
return -EINVAL;
 
-   ret = cs_etm__update_queues(etm);
+   // TODO: does this need to be here? It is already called in 
cs_etm__process_event
+   //   when the aux records are available.
+   //ret = cs_etm__update_queues(etm);
 
-   if (ret < 0)
-   return ret;
+   //if (ret < 0)
+   //  return ret;
 
if (etm->timeless_decoding)
return cs_etm__process_timeless_queues(etm, -1);
@@ -851,10 +853,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
etmq->queue_nr = queue_nr;
etmq->offset = 0;
 
-   if (etm->timeless_decoding)
-   return 0;
-   else
-   return cs_etm__seach_first_timestamp(etmq);
+   return 0;
 }
 
 static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
@@ -874,14 +873,20 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace 
*etm)
return 0;
 }
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm)
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
 {
+   int ret;
if (etm->queues.new_data) {
etm->queues.new_data = false;
-   return cs_etm__setup_queues(etm);
+   ret = cs_etm__setup_queues(etm);
+   if (ret)
+   return ret;
}
 
-   return 0;
+   if (!etm->timeless_decoding)
+   return 
cs_etm__seach_first_timestamp(etm->queues.queue_array[cpu].priv);
+   else
+   return 0;
 }
 
 static inline
@@ -2358,8 +2363,9 @@ static int cs_etm__process_event(struct perf_session 
*session,
else
timestamp = 0;
 
-   if (timestamp || etm->timeless_decoding) {
-   err = cs_etm__update_queues(etm);
+   if ((timestamp || etm->timeless_decoding)
+   && event->header.type == PERF_RECORD_AUX) {
+   err = cs_etm__update_queues(etm, sample->cpu);
if (err)
return err;
}
-- 
2.28.0



[RFC PATCH 4/5] perf cs-etm: don't process queues until cs_etm__flush_events

2021-02-09 Thread James Clark
To make sure processing happens in the correct order, queue processing
shouldn't start until every aux queue has had its first timestamp found.

Now that we're only searching for timestamps within each aux record, we
need to wait until all aux records are delivered before starting the
processing.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index efe418a7c82e..0aaa1f6d2822 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -2394,10 +2394,6 @@ static int cs_etm__process_event(struct perf_session 
*session,
else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
return cs_etm__process_switch_cpu_wide(etm, event);
 
-   if (!etm->timeless_decoding &&
-   event->header.type == PERF_RECORD_AUX)
-   return cs_etm__process_queues(etm);
-
return 0;
 }
 
-- 
2.28.0



[RFC PATCH 3/5] perf cs-etm: Save aux records in each etm queue

2021-02-09 Thread James Clark
The aux records will be used set the bounds of decoding in a
later commit. In the future we may also want to use the flags
of each record to control decoding.

Do these need to be saved in their entirety, or can pointers
to each record safely be saved instead for later access?

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 24 +++-
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 9ebe43d60d1e..efe418a7c82e 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -92,12 +92,16 @@ struct cs_etm_queue {
/* Conversion between traceID and index in traceid_queues array */
struct intlist *traceid_queues_list;
struct cs_etm_traceid_queue **traceid_queues;
+   int aux_record_list_len;
+   int aux_record_list_idx;
+   struct perf_record_aux *aux_record_list;
 };
 
 /* RB tree for quick conversion between traceID and metadata pointers */
 static struct intlist *traceid_list;
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu,
+struct perf_record_aux *aux_record);
 static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
   pid_t tid);
@@ -593,6 +597,7 @@ static void cs_etm__free_queue(void *priv)
 
cs_etm_decoder__free(etmq->decoder);
cs_etm__free_traceid_queues(etmq);
+   free(etmq->aux_record_list);
free(etmq);
 }
 
@@ -767,13 +772,22 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
cs_etm_auxtrace *etm)
return NULL;
 }
 
-static int cs_etm__seach_first_timestamp(struct cs_etm_queue *etmq)
+static int cs_etm__seach_first_timestamp(struct cs_etm_queue *etmq,
+struct perf_record_aux *aux_record)
 {
int ret = 0;
u64 timestamp;
unsigned int cs_queue_nr;
u8 trace_chan_id;
 
+   etmq->aux_record_list = reallocarray(etmq->aux_record_list,
+ etmq->aux_record_list_len+1,
+ sizeof(*etmq->aux_record_list));
+   if (!etmq->aux_record_list)
+   return -ENOMEM;
+
+   etmq->aux_record_list[etmq->aux_record_list_len++] = *aux_record;
+
/*
 * We are under a CPU-wide trace scenario.  As such we need to know
 * when the code that generated the traces started to execute so that
@@ -873,7 +887,7 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
return 0;
 }
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu, struct 
perf_record_aux *aux)
 {
int ret;
if (etm->queues.new_data) {
@@ -884,7 +898,7 @@ static int cs_etm__update_queues(struct cs_etm_auxtrace 
*etm, int cpu)
}
 
if (!etm->timeless_decoding)
-   return 
cs_etm__seach_first_timestamp(etm->queues.queue_array[cpu].priv);
+   return 
cs_etm__seach_first_timestamp(etm->queues.queue_array[cpu].priv, aux);
else
return 0;
 }
@@ -2365,7 +2379,7 @@ static int cs_etm__process_event(struct perf_session 
*session,
 
if ((timestamp || etm->timeless_decoding)
&& event->header.type == PERF_RECORD_AUX) {
-   err = cs_etm__update_queues(etm, sample->cpu);
+   err = cs_etm__update_queues(etm, sample->cpu, &event->aux);
if (err)
return err;
}
-- 
2.28.0



[RFC PATCH 0/5] Split Coresight decode by aux records

2021-02-09 Thread James Clark
The following patches fix opening perf.data files that have timestamps
(ordered data), aren't recorded with --per-thread, and that have
discontinuous data in a single aux trace buffer.

I have some open questions:
 * Can cs_etm__update_queues() be removed from cs_etm__flush_events()?
 * Why does the second commit start making some files process correctly?
 * Is it ok to wait for the flush to start processing? Previously
   processing happened when the first aux record was delivered to
   cs_etm__process_event().
 * Do the aux records need to be saved into a new buffer or can they
   be pulled from elsewhere?

I also have some further changes to make so that per-thread mode works
where the cpu field of the sample is set to -1. Also, when there are
no timestamps, cs_etm__process_timeless_queues() is used, which is a
completely different code path.

Thanks
James

James Clark (5):
  perf cs-etm: Split up etm queue setup function
  perf cs-etm: Only search timestamp in current sample's queue.
  perf cs-etm: Save aux records in each etm queue
  perf cs-etm: don't process queues until cs_etm__flush_events
  perf cs-etm: split decode by aux records.

 tools/perf/util/cs-etm.c | 200 +++
 1 file changed, 121 insertions(+), 79 deletions(-)

-- 
2.28.0



[RFC PATCH 1/5] perf cs-etm: Split up etm queue setup function

2021-02-09 Thread James Clark
Refactor the function into separate allocation and
timestamp search parts. Later the timestamp search
will be done multiple times.

Signed-off-by: James Clark 
---
 tools/perf/util/cs-etm.c | 60 +---
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a2a369e2fbb6..440001cdd3b8 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -765,33 +765,12 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct 
cs_etm_auxtrace *etm)
return NULL;
 }
 
-static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
-  struct auxtrace_queue *queue,
-  unsigned int queue_nr)
+static int cs_etm__seach_first_timestamp(struct cs_etm_queue *etmq)
 {
int ret = 0;
+   u64 timestamp;
unsigned int cs_queue_nr;
u8 trace_chan_id;
-   u64 timestamp;
-   struct cs_etm_queue *etmq = queue->priv;
-
-   if (list_empty(&queue->head) || etmq)
-   goto out;
-
-   etmq = cs_etm__alloc_queue(etm);
-
-   if (!etmq) {
-   ret = -ENOMEM;
-   goto out;
-   }
-
-   queue->priv = etmq;
-   etmq->etm = etm;
-   etmq->queue_nr = queue_nr;
-   etmq->offset = 0;
-
-   if (etm->timeless_decoding)
-   goto out;
 
/*
 * We are under a CPU-wide trace scenario.  As such we need to know
@@ -808,7 +787,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
 */
ret = cs_etm__get_data_block(etmq);
if (ret <= 0)
-   goto out;
+   return ret;
 
/*
 * Run decoder on the trace block.  The decoder will stop when
@@ -817,7 +796,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
 */
ret = cs_etm__decode_data_block(etmq);
if (ret)
-   goto out;
+   return ret;
 
/*
 * Function cs_etm_decoder__do_{hard|soft}_timestamp() does all
@@ -849,10 +828,33 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace 
*etm,
 * Note that packets decoded above are still in the traceID's packet
 * queue and will be processed in cs_etm__process_queues().
 */
-   cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id);
-   ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, timestamp);
-out:
-   return ret;
+   cs_queue_nr = TO_CS_QUEUE_NR(etmq->queue_nr, trace_chan_id);
+   return auxtrace_heap__add(&etmq->etm->heap, cs_queue_nr, timestamp);
+}
+
+static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
+  struct auxtrace_queue *queue,
+  unsigned int queue_nr)
+{
+   struct cs_etm_queue *etmq = queue->priv;
+
+   if (list_empty(&queue->head) || etmq)
+   return 0;
+
+   etmq = cs_etm__alloc_queue(etm);
+
+   if (!etmq)
+   return -ENOMEM;
+
+   queue->priv = etmq;
+   etmq->etm = etm;
+   etmq->queue_nr = queue_nr;
+   etmq->offset = 0;
+
+   if (etm->timeless_decoding)
+   return 0;
+   else
+   return cs_etm__seach_first_timestamp(etmq);
 }
 
 static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
-- 
2.28.0



Re: [PATCH 4/4] perf tools: determine if LR is the return address

2021-02-08 Thread James Clark



On 22/01/2021 18:18, Alexandre Truong wrote:

> +}
> +
> +static int add_entry(struct unwind_entry *entry, void *arg)
> +{
> + struct entries *entries = arg;
> +
> + entries->stack[entries->i++] = entry->ip;
> + return 0;
> +}
> +
> +u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread 
> *thread)
> +{
> + u64 leaf_frame;
> + struct entries entries = {{0, 0}, 0};
> +
> + if (get_leaf_frame_caller_enabled(sample))
> + return 0;
> +
> + unwind__get_entries(add_entry, &entries, thread, sample, 2);
> + leaf_frame = callchain_param.order == ORDER_CALLER ?
> + entries.stack[0] : entries.stack[1];
> +
> + if (leaf_frame + 1 == sample->user_regs.regs[PERF_REG_ARM64_LR])
> + return sample->user_regs.regs[PERF_REG_ARM64_LR];

Hi Alex,

From your other reply about your investigation it looks like the check against
PERF_REG_ARM64_LR isn't required because libunwind won't return a value if
it's not correct, whether it's equal to the LR or not.

And PERF_REG_ARM64_LR points to the instruction _after_ the call site. i.e. 
where to return to,
not where the call was made from. So just leaf_frame rather than leaf_frame+1 
would be more accurate.

I was also looking at unwind_entry in machine.c which is similar to your 
add_entry function and saw that it
does some extra bits like this:

if (symbol_conf.hide_unresolved && entry->ms.sym == NULL)
return 0;

if (append_inlines(cursor, &entry->ms, entry->ip) == 0)
return 0;

/*
 * Convert entry->ip from a virtual address to an offset in
 * its corresponding binary.
 */
if (entry->ms.map)
addr = map__map_ip(entry->ms.map, entry->ip);

I have a feeling you will also need to do those on your values returned from 
libunwind to make it 100%
equivalent.

James

> + return 0;
> +}
> diff --git a/tools/perf/util/arm-frame-pointer-unwind-support.h 
> b/tools/perf/util/arm-frame-pointer-unwind-support.h
> new file mode 100644
> index ..16dc03fa9abe
> --- /dev/null
> +++ b/tools/perf/util/arm-frame-pointer-unwind-support.h
> @@ -0,0 +1,7 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
> +#define __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
> +
> +u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread 
> *thread);
> +
> +#endif /* __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H */
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 40082d70eec1..bc6147e46c89 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -34,6 +34,7 @@
>  #include "bpf-event.h"
>  #include  // page_size
>  #include "cgroup.h"
> +#include "arm-frame-pointer-unwind-support.h"
>  
>  #include 
>  #include 
> @@ -2671,10 +2672,12 @@ static int find_prev_cpumode(struct ip_callchain 
> *chain, struct thread *thread,
>   return err;
>  }
>  
> -static u64 get_leaf_frame_caller(struct perf_sample *sample __maybe_unused,
> - struct thread *thread __maybe_unused)
> +static u64 get_leaf_frame_caller(struct perf_sample *sample, struct thread 
> *thread)
>  {
> - return 0;
> + if (strncmp(thread->maps->machine->env->arch, "aarch64", 7) == 0)
> + return get_leaf_frame_caller_aarch64(sample, thread);
> + else
> + return 0;
>  }
>  
>  static int thread__resolve_callchain_sample(struct thread *thread,
> 


Re: [PATCH 8/8] perf arm-spe: Set thread TID

2021-02-01 Thread James Clark



On 31/01/2021 14:01, Leo Yan wrote:
> Option 1: by merging patches 07/08 and 08/08, we can firstly support PID
> tracing for root namespace, and later we can extend to support PID
> tracing in container (and in VMs).
> 
> Option 2: we can use the software method to establish PID for SPE
> trace, which can base on kernel's events PERF_RECORD_SWITCH /
> PERF_RECORD_SWITCH_CPU_WIDE and check context switch ip.
> 
> To be honest, I am a bit concern for option 1 for later might
> introduce regression when later support PID for containers (and VMs).
> If you have a plan for option 1, I think it's good to record current
> limitation and the plan for next step in the commit log, so we can merge
> this patch at this time and later extend for containers.
> 
> Otherwise, we need to consider how to implement the PID tracing with
> option 2.  If it is the case, we should firstly only merge patches
> 01 ~ 06 for data source enabling.  How about you think for this?

In my opinion we should do option 1 and use what is there at the moment. That
gets users 90% of the functionality right now.

I plan to look at option 2 at some point, and it can always be added on top of
option 1 or replace what is there. But I don't know when I would get to it or
how long it will take.

James

> 
>> Signed-off-by: Leo Yan 
>> Signed-off-by: James Clark 
> 
> Besides for techinical question, you could add your "Co-developed-by"
> tags for patches 06, 07, 08/08, which you have took time to refin them.
> 
> Thanks you for kindly efforts.
> 
> [1] https://lore.kernel.org/patchwork/patch/1353286/
> 
>> Cc: Peter Zijlstra 
>> Cc: Ingo Molnar 
>> Cc: Arnaldo Carvalho de Melo 
>> Cc: Mark Rutland 
>> Cc: Alexander Shishkin 
>> Cc: Jiri Olsa 
>> Cc: Namhyung Kim 
>> Cc: John Garry 
>> Cc: Will Deacon 
>> Cc: Mathieu Poirier 
>> Cc: Al Grant 
>> Cc: Andre Przywara 
>> Cc: Wei Li 
>> Cc: Tan Xiaojun 
>> Cc: Adrian Hunter 
>> ---
>>  tools/perf/util/arm-spe.c | 75 ++-
>>  1 file changed, 50 insertions(+), 25 deletions(-)
>>
>> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
>> index 27a0b9dfe22d..9828fad7e516 100644
>> --- a/tools/perf/util/arm-spe.c
>> +++ b/tools/perf/util/arm-spe.c
>> @@ -223,6 +223,46 @@ static inline u8 arm_spe_cpumode(struct arm_spe *spe, 
>> u64 ip)
>>  PERF_RECORD_MISC_USER;
>>  }
>>  
>> +static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
>> +struct auxtrace_queue *queue)
>> +{
>> +struct arm_spe_queue *speq = queue->priv;
>> +pid_t tid;
>> +
>> +tid = machine__get_current_tid(spe->machine, speq->cpu);
>> +if (tid != -1) {
>> +speq->tid = tid;
>> +thread__zput(speq->thread);
>> +} else
>> +speq->tid = queue->tid;
>> +
>> +if ((!speq->thread) && (speq->tid != -1)) {
>> +speq->thread = machine__find_thread(spe->machine, -1,
>> +speq->tid);
>> +}
>> +
>> +if (speq->thread) {
>> +speq->pid = speq->thread->pid_;
>> +if (queue->cpu == -1)
>> +speq->cpu = speq->thread->cpu;
>> +}
>> +}
>> +
>> +static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
>> +{
>> +int err;
>> +struct arm_spe *spe = speq->spe;
>> +struct auxtrace_queue *queue;
>> +
>> +err = machine__set_current_tid(spe->machine, speq->cpu, tid, tid);
>> +if (err)
>> +return err;
>> +
>> +queue = &speq->spe->queues.queue_array[speq->queue_nr];
>> +arm_spe_set_pid_tid_cpu(speq->spe, queue);
>> +return 0;
>> +}
>> +
>>  static void arm_spe_prep_sample(struct arm_spe *spe,
>>  struct arm_spe_queue *speq,
>>  union perf_event *event,
>> @@ -431,6 +471,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
>>  static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
>>  {
>>  struct arm_spe *spe = speq->spe;
>> +const struct arm_spe_record *record;
>>  int ret;
>>  
>>  if (!spe->kernel_start)
>> @@ -450,6 +491,11 @@ static int arm_spe_run_decoder(struct arm_spe_queue 
>> *speq, u64 *timestamp)
>>  if (ret < 0)
>>  cont

Re: [PATCH 4/4] perf tools: determine if LR is the return address

2021-01-26 Thread James Clark



On 24/01/2021 02:05, Jiri Olsa wrote:
> On Fri, Jan 22, 2021 at 04:18:54PM +, Alexandre Truong wrote:
>> On arm64 and frame pointer mode (e.g: perf record --callgraph fp),
>> use dwarf unwind info to check if the link register is the return
>> address in order to inject it to the frame pointer stack.
>>
>> Write the following application:
>>
>>  int a = 10;
>>
>>  void f2(void)
>>  {
>>  for (int i = 0; i < 100; i++)
>>  a *= a;
>>  }
>>
>>  void f1()
>>  {
>>  f2();
>>  }
>>
>>  int main (void)
>>  {
>>  f1();
>>  return 0;
>>  }
>>
>> with the following compilation flags:
>>  gcc -g -fno-omit-frame-pointer -fno-inline -O1
>>
>> The compiler omits the frame pointer for f2 on arm. This is a problem
>> with any leaf call, for example an application with many different
>> calls to malloc() would always omit the calling frame, even if it
>> can be determined.
>>
>>  ./perf record --call-graph fp ./a.out
>>  ./perf report
>>
>> currently gives the following stack:
>>
>> 0xea52f361
>> _start
>> __libc_start_main
>> main
>> f2
> 
> reproduced on x86 as well
> 
>> +static bool get_leaf_frame_caller_enabled(struct perf_sample *sample)
>> +{
>> +return callchain_param.record_mode != CALLCHAIN_FP || 
>> !sample->user_regs.regs
>> +|| sample->user_regs.mask != PERF_REGS_MASK;
>> +}
>> +
>> +static int add_entry(struct unwind_entry *entry, void *arg)
>> +{
>> +struct entries *entries = arg;
>> +
>> +entries->stack[entries->i++] = entry->ip;
>> +return 0;
>> +}
>> +
>> +u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread 
>> *thread)
>> +{
>> +u64 leaf_frame;
>> +struct entries entries = {{0, 0}, 0};
>> +
>> +if (get_leaf_frame_caller_enabled(sample))
> 
> the name suggest you'd want to continue if it's true
> 
>> +return 0;
>> +
>> +unwind__get_entries(add_entry, &entries, thread, sample, 2);
> 
> I'm scratching my head how this unwinds anything, you enabled just
> registers, not the stack right? so the unwind code would do just
> IP -> LR + 1 shift?

I think the idea about using libunwind is that the LR might not
be a valid return address. It could be used as a general purpose
register, or just not used at all.

Libunwind should be able to use the dwarf present in the binary to
unwind one frame, as long as nothing stored in the stack is needed.

But now I look at the disassembly for this example, I see that f2()
just has a single 'b' instruction, and not 'bl' so the link register
won't be set. And also 'f1' does store a few things on the stack.
Whether these are needed or not to unwind one frame I'm not sure.

It could be that libunwind is falling back to a frame pointer unwind
mode, which we don't want.

I think it needs further investigation.


James

> 
> thanks,
> jirka
> 
>> +leaf_frame = callchain_param.order == ORDER_CALLER ?
>> +entries.stack[0] : entries.stack[1];
>> +
>> +if (leaf_frame + 1 == sample->user_regs.regs[PERF_REG_ARM64_LR])
>> +return sample->user_regs.regs[PERF_REG_ARM64_LR];
>> +return 0;
>> +}
> 
> SNIP
> 


[PATCH 5/8] perf arm-spe: Synthesize memory event

2021-01-19 Thread James Clark
From: Leo Yan 

The memory event can deliver two benefits:

- The first benefit is the memory event can give out global view for
  memory accessing, rather than organizing events with scatter mode
  (e.g. uses a separate event for L1 cache, last level cache, etc) which
  can only display an event for a single memory type; memory events
  include all memory accesses so they can display the data accesses
  across memory levels in the same view;

- The second benefit is the sample generation might introduce a big
  overhead and need to wait for long time for Perf reporting, we can
  specify itrace option '--itrace=M' to filter out other events and only
  output memory events, this can significantly reduce the overhead
  caused by generating samples.

This patch is to enable memory event for Arm SPE.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 578725344603..5550906486d8 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -53,6 +53,7 @@ struct arm_spe {
u8  sample_tlb;
u8  sample_branch;
u8  sample_remote_access;
+   u8  sample_memory;
 
u64 l1d_miss_id;
u64 l1d_access_id;
@@ -62,6 +63,7 @@ struct arm_spe {
u64 tlb_access_id;
u64 branch_miss_id;
u64 remote_access_id;
+   u64 memory_id;
 
u64 kernel_start;
 
@@ -293,6 +295,18 @@ static int arm_spe__synth_branch_sample(struct 
arm_spe_queue *speq,
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
 
+#define SPE_MEM_TYPE   (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS | \
+ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS | \
+ARM_SPE_REMOTE_ACCESS)
+
+static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
+{
+   if (type & SPE_MEM_TYPE)
+   return true;
+
+   return false;
+}
+
 static int arm_spe_sample(struct arm_spe_queue *speq)
 {
const struct arm_spe_record *record = &speq->decoder->record;
@@ -354,6 +368,12 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
return err;
}
 
+   if (spe->sample_memory && arm_spe__is_memory_event(record->type)) {
+   err = arm_spe__synth_mem_sample(speq, spe->memory_id);
+   if (err)
+   return err;
+   }
+
return 0;
 }
 
@@ -917,6 +937,16 @@ arm_spe_synth_events(struct arm_spe *spe, struct 
perf_session *session)
id += 1;
}
 
+   if (spe->synth_opts.mem) {
+   spe->sample_memory = true;
+
+   err = arm_spe_synth_event(session, &attr, id);
+   if (err)
+   return err;
+   spe->memory_id = id;
+   arm_spe_set_event_name(evlist, id, "memory");
+   }
+
return 0;
 }
 
-- 
2.28.0



[PATCH 8/8] perf arm-spe: Set thread TID

2021-01-19 Thread James Clark
From: Leo Yan 

Set thread TID for SPE samples. Now that the context ID is saved
in each record it can be used to set the TID for a sample.

The context ID is only present in SPE data if the kernel is
compiled with CONFIG_PID_IN_CONTEXTIDR and perf record is
run as root. Otherwise the PID of the first process is assigned
to each SPE sample.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 75 ++-
 1 file changed, 50 insertions(+), 25 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 27a0b9dfe22d..9828fad7e516 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -223,6 +223,46 @@ static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 
ip)
PERF_RECORD_MISC_USER;
 }
 
+static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
+   struct auxtrace_queue *queue)
+{
+   struct arm_spe_queue *speq = queue->priv;
+   pid_t tid;
+
+   tid = machine__get_current_tid(spe->machine, speq->cpu);
+   if (tid != -1) {
+   speq->tid = tid;
+   thread__zput(speq->thread);
+   } else
+   speq->tid = queue->tid;
+
+   if ((!speq->thread) && (speq->tid != -1)) {
+   speq->thread = machine__find_thread(spe->machine, -1,
+   speq->tid);
+   }
+
+   if (speq->thread) {
+   speq->pid = speq->thread->pid_;
+   if (queue->cpu == -1)
+   speq->cpu = speq->thread->cpu;
+   }
+}
+
+static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
+{
+   int err;
+   struct arm_spe *spe = speq->spe;
+   struct auxtrace_queue *queue;
+
+   err = machine__set_current_tid(spe->machine, speq->cpu, tid, tid);
+   if (err)
+   return err;
+
+   queue = &speq->spe->queues.queue_array[speq->queue_nr];
+   arm_spe_set_pid_tid_cpu(speq->spe, queue);
+   return 0;
+}
+
 static void arm_spe_prep_sample(struct arm_spe *spe,
struct arm_spe_queue *speq,
union perf_event *event,
@@ -431,6 +471,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
 {
struct arm_spe *spe = speq->spe;
+   const struct arm_spe_record *record;
int ret;
 
if (!spe->kernel_start)
@@ -450,6 +491,11 @@ static int arm_spe_run_decoder(struct arm_spe_queue *speq, 
u64 *timestamp)
if (ret < 0)
continue;
 
+   record = &speq->decoder->record;
+   ret = arm_spe_set_tid(speq, record->context_id);
+   if (ret)
+   return ret;
+
ret = arm_spe_sample(speq);
if (ret)
return ret;
@@ -500,6 +546,10 @@ static int arm_spe__setup_queue(struct arm_spe *spe,
 
record = &speq->decoder->record;
 
+   ret = arm_spe_set_tid(speq, record->context_id);
+   if (ret)
+   return ret;
+
speq->timestamp = record->timestamp;
ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
if (ret)
@@ -552,31 +602,6 @@ static bool arm_spe__is_timeless_decoding(struct arm_spe 
*spe)
return timeless_decoding;
 }
 
-static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
-   struct auxtrace_queue *queue)
-{
-   struct arm_spe_queue *speq = queue->priv;
-   pid_t tid;
-
-   tid = machine__get_current_tid(spe->machine, speq->cpu);
-   if (tid != -1) {
-   speq->tid = tid;
-   thread__zput(speq->thread);
-   } else
-   speq->tid = queue->tid;
-
-   if ((!speq->thread) && (speq->tid != -1)) {
-   speq->thread = machine__find_thread(spe->machine, -1,
-   speq->tid);
-   }
-
-   if (speq->thread) {
-   speq->pid = speq->thread->pid_;
-   if (queue->cpu == -1)
-   speq->cpu = speq->thread->cpu;
-   }
-}
-
 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
 {
unsigned int queue_nr;
-- 
2.28.0



[PATCH 7/8] perf arm-spe: Save context ID in record

2021-01-19 Thread James Clark
From: Leo Yan 

This patch is to save context ID in record, this will be used to set TID
for samples.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 2 ++
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 32fe41835fa6..1b58859d2314 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -151,6 +151,7 @@ static int arm_spe_read_record(struct arm_spe_decoder 
*decoder)
u64 payload, ip;
 
memset(&decoder->record, 0x0, sizeof(decoder->record));
+   decoder->record.context_id = -1;
 
while (1) {
err = arm_spe_get_next_packet(decoder);
@@ -180,6 +181,7 @@ static int arm_spe_read_record(struct arm_spe_decoder 
*decoder)
case ARM_SPE_COUNTER:
break;
case ARM_SPE_CONTEXT:
+   decoder->record.context_id = payload;
break;
case ARM_SPE_OP_TYPE:
if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) {
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 59bdb7309674..46a8556a9e95 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -38,6 +38,7 @@ struct arm_spe_record {
u64 timestamp;
u64 virt_addr;
u64 phys_addr;
+   u64 context_id;
 };
 
 struct arm_spe_insn;
-- 
2.28.0



[PATCH 6/8] perf arm-spe: Set sample's data source field

2021-01-19 Thread James Clark
From: Leo Yan 

The sample structure contains the field 'data_src' which is used to
tell the data operation attributions, e.g. operation type is loading or
storing, cache level, it's snooping or remote accessing, etc.  At the
end, the 'data_src' will be parsed by perf mem/c2c tools to display
human readable strings.

This patch is to fill the 'data_src' field in the synthesized samples
based on the different types.  Currently perf tool can display statistics for
L1/L2/L3 caches but it doesn't support the 'last level cache'.  To fit
to current implementation, 'data_src' field uses L3 cache for last level
cache.

Before this commit, perf mem report looks like this:
# Samples: 75K of event 'l1d-miss'
# Total weight : 75951
# Sort order   : 
local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked
#
# Overhead   Samples  Local Weight  Memory access Symbol
  Shared Object Data Symbol Data Object   Snoop 
TLB access
#         
..    ..  
    ...
#
81.56% 61945  0 N/A   [.] 
0x09d8  serial_c  [.] [unknown] 
N/A   N/A
18.44% 14003  0 N/A   [.] 
0x0828  serial_c  [.] [unknown] 
N/A   N/A

Now on a system with Arm SPE, addresses and access types are displayed:

# Samples: 75K of event 'l1d-miss'
# Total weight : 75951
# Sort order   : 
local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked
#
# Overhead   Samples  Local Weight  Memory access Symbol
  Shared Object Data Symbol Data Object  Snoop  
   TLB access
#         
..    ..  ...  
  ..
#
 0.43%   324  0 L1 miss   [.] 
0x09d8  serial_c  [.] 0x80794e00  anon N/A  
 Walker hit
 0.42%   322  0 L1 miss   [.] 
0x09d8  serial_c  [.] 0x80794580  anon     N/A  
 Walker hit

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 69 ++-
 1 file changed, 60 insertions(+), 9 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 5550906486d8..27a0b9dfe22d 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -261,7 +261,7 @@ arm_spe_deliver_synth_event(struct arm_spe *spe,
 }
 
 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
-u64 spe_events_id)
+u64 spe_events_id, u64 data_src)
 {
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
@@ -274,6 +274,7 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue 
*speq,
sample.stream_id = spe_events_id;
sample.addr = record->virt_addr;
sample.phys_addr = record->phys_addr;
+   sample.data_src = data_src;
 
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
@@ -307,21 +308,66 @@ static bool arm_spe__is_memory_event(enum 
arm_spe_sample_type type)
return false;
 }
 
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+{
+   union perf_mem_data_src data_src = { 0 };
+
+   if (record->op == ARM_SPE_LD)
+   data_src.mem_op = PERF_MEM_OP_LOAD;
+   else
+   data_src.mem_op = PERF_MEM_OP_STORE;
+
+   if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
+   data_src.mem_lvl = PERF_MEM_LVL_L3;
+
+   if (record->type & ARM_SPE_LLC_MISS)
+   data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+   else
+   data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+   } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
+   data_src.mem_lvl = PERF_MEM_LVL_L1;
+
+   if (record->type & ARM_SPE_L1D_MISS)
+   data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+   else
+ 

[PATCH 3/8] perf arm-spe: Store operation type in packet

2021-01-19 Thread James Clark
From: Leo Yan 

This patch is to store operation type in packet structure.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 6 ++
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 7aac3048b090..32fe41835fa6 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -182,6 +182,12 @@ static int arm_spe_read_record(struct arm_spe_decoder 
*decoder)
case ARM_SPE_CONTEXT:
break;
case ARM_SPE_OP_TYPE:
+   if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) {
+   if (payload & 0x1)
+   decoder->record.op = ARM_SPE_ST;
+   else
+   decoder->record.op = ARM_SPE_LD;
+   }
break;
case ARM_SPE_EVENTS:
if (payload & BIT(EV_L1D_REFILL))
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 7b845001afe7..59bdb7309674 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -24,9 +24,15 @@ enum arm_spe_sample_type {
ARM_SPE_REMOTE_ACCESS   = 1 << 7,
 };
 
+enum arm_spe_op_type {
+   ARM_SPE_LD  = 1 << 0,
+   ARM_SPE_ST  = 1 << 1,
+};
+
 struct arm_spe_record {
enum arm_spe_sample_type type;
int err;
+   u32 op;
u64 from_ip;
u64 to_ip;
u64 timestamp;
-- 
2.28.0



[PATCH 4/8] perf arm-spe: Fill address info for samples

2021-01-19 Thread James Clark
From: Leo Yan 

To properly handle memory and branch samples, this patch divides into
two functions for generating samples: arm_spe__synth_mem_sample() is for
synthesizing memory and TLB samples; arm_spe__synth_branch_sample() is
to synthesize branch samples.

Arm SPE backend decoder has passed virtual and physical address through
packets, the address info is stored into the synthesize samples in the
function arm_spe__synth_mem_sample().

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 52 +++
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index b134516e890b..578725344603 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -235,7 +235,6 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
sample->cpumode = arm_spe_cpumode(spe, sample->ip);
sample->pid = speq->pid;
sample->tid = speq->tid;
-   sample->addr = record->to_ip;
sample->period = 1;
sample->cpu = speq->cpu;
 
@@ -259,18 +258,37 @@ arm_spe_deliver_synth_event(struct arm_spe *spe,
return ret;
 }
 
-static int
-arm_spe_synth_spe_events_sample(struct arm_spe_queue *speq,
-   u64 spe_events_id)
+static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
+u64 spe_events_id)
 {
struct arm_spe *spe = speq->spe;
+   struct arm_spe_record *record = &speq->decoder->record;
+   union perf_event *event = speq->event_buf;
+   struct perf_sample sample = { 0 };
+
+   arm_spe_prep_sample(spe, speq, event, &sample);
+
+   sample.id = spe_events_id;
+   sample.stream_id = spe_events_id;
+   sample.addr = record->virt_addr;
+   sample.phys_addr = record->phys_addr;
+
+   return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+}
+
+static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
+   u64 spe_events_id)
+{
+   struct arm_spe *spe = speq->spe;
+   struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
-   struct perf_sample sample = { .ip = 0, };
+   struct perf_sample sample = { 0 };
 
arm_spe_prep_sample(spe, speq, event, &sample);
 
sample.id = spe_events_id;
sample.stream_id = spe_events_id;
+   sample.addr = record->to_ip;
 
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
@@ -283,15 +301,13 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 
if (spe->sample_flc) {
if (record->type & ARM_SPE_L1D_MISS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->l1d_miss_id);
+   err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id);
if (err)
return err;
}
 
if (record->type & ARM_SPE_L1D_ACCESS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->l1d_access_id);
+   err = arm_spe__synth_mem_sample(speq, 
spe->l1d_access_id);
if (err)
return err;
}
@@ -299,15 +315,13 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 
if (spe->sample_llc) {
if (record->type & ARM_SPE_LLC_MISS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->llc_miss_id);
+   err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id);
if (err)
return err;
}
 
if (record->type & ARM_SPE_LLC_ACCESS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->llc_access_id);
+   err = arm_spe__synth_mem_sample(speq, 
spe->llc_access_id);
if (err)
return err;
}
@@ -315,31 +329,27 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 
if (spe->sample_tlb) {
if (record->type & ARM_SPE_TLB_MISS) {
-   err = arm_spe_synth_spe_events_sample(
-   speq, spe->tlb_miss_

[PATCH 2/8] perf arm-spe: Store memory address in packet

2021-01-19 Thread James Clark
From: Leo Yan 

This patch is to store virtual and physical memory addresses in packet,
which will be used for memory samples.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 4 
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 90d575cee1b9..7aac3048b090 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -172,6 +172,10 @@ static int arm_spe_read_record(struct arm_spe_decoder 
*decoder)
decoder->record.from_ip = ip;
else if (idx == SPE_ADDR_PKT_HDR_INDEX_BRANCH)
decoder->record.to_ip = ip;
+   else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT)
+   decoder->record.virt_addr = ip;
+   else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS)
+   decoder->record.phys_addr = ip;
break;
case ARM_SPE_COUNTER:
break;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h 
b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 24727b8ca7ff..7b845001afe7 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -30,6 +30,8 @@ struct arm_spe_record {
u64 from_ip;
u64 to_ip;
u64 timestamp;
+   u64 virt_addr;
+   u64 phys_addr;
 };
 
 struct arm_spe_insn;
-- 
2.28.0



[PATCH 1/8] perf arm-spe: Enable sample type PERF_SAMPLE_DATA_SRC

2021-01-19 Thread James Clark
From: Leo Yan 

This patch is to enable sample type PERF_SAMPLE_DATA_SRC for Arm SPE in
the perf data, when output the tracing data, it tells tools that it
contains data source in the memory event.

Signed-off-by: Leo Yan 
Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Al Grant 
Cc: Andre Przywara 
Cc: Wei Li 
Cc: Tan Xiaojun 
Cc: Adrian Hunter 
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 8901a1656a41..b134516e890b 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -803,7 +803,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct 
perf_session *session)
attr.type = PERF_TYPE_HARDWARE;
attr.sample_type = evsel->core.attr.sample_type & PERF_SAMPLE_MASK;
attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
-   PERF_SAMPLE_PERIOD;
+   PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC;
if (spe->timeless_decoding)
attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
else
-- 
2.28.0



[PATCH] perf tools: Update OpenCSD to v1.0.0

2021-01-08 Thread James Clark
Replace the OCSD_INSTR switch statement with an if to
fix compilation error about unhandled values and avoid
this issue again in the future.

Add new OCSD_GEN_TRC_ELEM_SYNC_MARKER and
OCSD_GEN_TRC_ELEM_MEMTRANS enum values to fix unhandled
value compilation error. Currently they are ignored.

Increase the minimum version number to v1.0.0 now
that new enum values are used that are only present
in this version.

Signed-off-by: James Clark 
Cc: John Garry 
Cc: Will Deacon 
Cc: Mathieu Poirier 
Cc: Leo Yan 
Cc: Suzuki K Poulose 
Cc: Mike Leach 
Cc: Al Grant 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
---
 tools/build/feature/test-libopencsd.c   |  4 ++--
 tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 15 ---
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/tools/build/feature/test-libopencsd.c 
b/tools/build/feature/test-libopencsd.c
index 1547bc2c0950..52c790b0317b 100644
--- a/tools/build/feature/test-libopencsd.c
+++ b/tools/build/feature/test-libopencsd.c
@@ -4,9 +4,9 @@
 /*
  * Check OpenCSD library version is sufficient to provide required features
  */
-#define OCSD_MIN_VER ((0 << 16) | (14 << 8) | (0))
+#define OCSD_MIN_VER ((1 << 16) | (0 << 8) | (0))
 #if !defined(OCSD_VER_NUM) || (OCSD_VER_NUM < OCSD_MIN_VER)
-#error "OpenCSD >= 0.14.0 is required"
+#error "OpenCSD >= 1.0.0 is required"
 #endif
 
 int main(void)
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c 
b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index cd007cc9c283..3f4bc4050477 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -419,19 +419,10 @@ cs_etm_decoder__buffer_range(struct cs_etm_queue *etmq,
packet->last_instr_subtype = elem->last_i_subtype;
packet->last_instr_cond = elem->last_instr_cond;
 
-   switch (elem->last_i_type) {
-   case OCSD_INSTR_BR:
-   case OCSD_INSTR_BR_INDIRECT:
+   if (elem->last_i_type == OCSD_INSTR_BR || elem->last_i_type == 
OCSD_INSTR_BR_INDIRECT)
packet->last_instr_taken_branch = elem->last_instr_exec;
-   break;
-   case OCSD_INSTR_ISB:
-   case OCSD_INSTR_DSB_DMB:
-   case OCSD_INSTR_WFI_WFE:
-   case OCSD_INSTR_OTHER:
-   default:
+   else
packet->last_instr_taken_branch = false;
-   break;
-   }
 
packet->last_instr_size = elem->last_instr_sz;
 
@@ -572,6 +563,8 @@ static ocsd_datapath_resp_t 
cs_etm_decoder__gen_trace_elem_printer(
case OCSD_GEN_TRC_ELEM_EVENT:
case OCSD_GEN_TRC_ELEM_SWTRACE:
case OCSD_GEN_TRC_ELEM_CUSTOM:
+   case OCSD_GEN_TRC_ELEM_SYNC_MARKER:
+   case OCSD_GEN_TRC_ELEM_MEMTRANS:
default:
break;
}
-- 
2.28.0



Re: [PATCH] drivers/perf: Enable PID_IN_CONTEXTIDR with SPE

2020-12-14 Thread James Clark



On 02/12/2020 01:09, Will Deacon wrote:
> On Tue, Dec 01, 2020 at 12:10:40PM +0800, Leo Yan wrote:
>> On Mon, Nov 30, 2020 at 04:46:51PM +, Will Deacon wrote:
>>> On Mon, Nov 30, 2020 at 06:24:54PM +0200, James Clark wrote:
>>>> Enable PID_IN_CONTEXTIDR by default when Arm SPE is enabled.
>>>> This flag is required to get PID data in the SPE trace. Without
>>>> it the perf tool will report 0 for PID which isn't very useful,
>>>> especially when doing system wide profiling or profiling
>>>> applications that fork.
>>>
>>> Can perf not figure out the pid some other way? (e.g. by tracing context
>>> switches and correlating that with the SPE data?).
>>
>> For perf 'per-thread' mode, we can use context switch trace event as
>> assisted info to select thread context.  But for "system wide" mode and
>> "snapshot" mode in perf tool, since the trace data is continuous, I
>> think we cannot use context switch trace event to correlate the SPE
>> trace data.
> 
> Is there no way to correlate them with something like CNTVCT?
> 
>>> Also, how does this work with pid namespaces?
>>
>> Here we are studying the implementation of Intel-PT and Arm CoreSight.
>>
>> The context ID is stored into the hardware trace data when recording;
>> afterwards when perf tool decodes the trace data and detects the
>> packet for context ID, it will select the machine's thread context in
>> perf [1].  Since the perf tool gathers all the thread information in
>> perf data file, based on the context ID, it can find the corresponding
>> thread pointer with function machine__find_thread() [2].
>>
>> Since your question is for "pid namespace", to be honest, I don't know
>> how the perf tool handles any conflict when different processes share
>> the same PID, and I am not sure if you are asking CGroup related stuff
>> or not.  If this cannot answer your question, please let me know.
> 
> My point was that the pid value written to CONTEXTIDR is a global pid
> and does not take namespacing into account. If perf is run inside a pid
> namespace, it will therefore not work.

That's an interesting point, but I think we should improve this for the simple
use case without namespaces first just to improve the user experience, so I've
sent v2 of the patch with the change you suggested about using "default y".

One other thing that is an issue that I'd like to ask about is this line in
arm_spe_pmu.c:

if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable())
reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);

This means that the user has to be root to get the context saved with SPE.
Is this a necessary security feature? I thought that PIDs are viewable by
all users anyway? Do you think there is any way we could remove the 
perfmon_capable()
requirement?

James


[PATCH v2] drivers/perf: Enable PID_IN_CONTEXTIDR with SPE

2020-12-14 Thread James Clark
Enable PID_IN_CONTEXTIDR by default when Arm SPE is enabled.
This flag is required to get PID data in the SPE trace. Without
it the perf tool will report 0 for PID which isn't very useful,
especially when doing system wide profiling or profiling
applications that fork.

There is a small performance overhead when enabling
PID_IN_CONTEXTIDR, but SPE itself is optional and not enabled by
default so the impact is minimised.

Cc: Will Deacon 
Cc: Mark Rutland 
Cc: Al Grant 
Cc: Leo Yan 
Cc: John Garry 
Cc: Suzuki K Poulose 
Cc: Mathieu Poirier 
Cc: Catalin Marinas 
Signed-off-by: James Clark 
---
 arch/arm64/Kconfig.debug | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 265c4461031f..b030bb21a0bb 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -2,6 +2,7 @@
 
 config PID_IN_CONTEXTIDR
bool "Write the current PID to the CONTEXTIDR register"
+   default y if ARM_SPE_PMU
help
  Enabling this option causes the kernel to write the current PID to
  the CONTEXTIDR register, at the expense of some additional
-- 
2.28.0



[PATCH] drivers/perf: Enable PID_IN_CONTEXTIDR with SPE

2020-11-30 Thread James Clark
Enable PID_IN_CONTEXTIDR by default when Arm SPE is enabled.
This flag is required to get PID data in the SPE trace. Without
it the perf tool will report 0 for PID which isn't very useful,
especially when doing system wide profiling or profiling
applications that fork.

There is a small performance overhead when enabling
PID_IN_CONTEXTIDR, but SPE itself is optional and not enabled by
default so the impact is minimised.

Cc: Will Deacon 
Cc: Mark Rutland 
Cc: Al Grant 
Cc: Leo Yan 
Cc: John Garry 
Cc: Suzuki K Poulose 
Signed-off-by: James Clark 
---
 drivers/perf/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 130327ff0b0e..47ede46c3d57 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -125,6 +125,7 @@ config XGENE_PMU
 config ARM_SPE_PMU
tristate "Enable support for the ARMv8.2 Statistical Profiling 
Extension"
depends on ARM64
+   select PID_IN_CONTEXTIDR
help
  Enable perf support for the ARMv8.2 Statistical Profiling
  Extension, which provides periodic sampling of operations in
-- 
2.28.0



Re: [PATCH] perf tools: add aarch64 registers to --user-regs

2020-11-30 Thread James Clark


On 27/11/2020 17:39, Alexandre Truong wrote:
> Previously, this command returns no help message on aarch64:
> 
>   -> ./perf record --user-regs=?
> 
>   available registers:
>   Usage: perf record [] []
>   or: perf record [] --  []
> 
> With this change, the registers are listed.
> 
>   -> ./perf record --user-regs=?
> 
>   available registers: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 
> x16 x17 x18 x19 x20 x21 x22 x23 x24 x25 x26 x27 x28 x29 lr sp pc
> 
> It's also now possible to record subsets of registers on aarch64:
> 
>   -> ./perf record --user-regs=x4,x5 ls
>   -> ./perf report --dump-raw-trace
> 
>   12801163749305260 0xc70 [0x40]: PERF_RECORD_SAMPLE(IP, 0x2): 51956/51956: 
> 0xaa6571f0 period: 145785 addr: 0
>   ... user regs: mask 0x30 ABI 64-bit
>    x40x006c
>    x50x00100101
>... thread: ls:51956
> .. dso: /usr/lib64/ld-2.17.so
> 

Checked that the registers can be listed with =? and that recording different 
combinations of registers works as expected.

Tested-by: James Clark 


[PATCH v6 10/12] perf tools: Add separate die member

2020-11-26 Thread James Clark
Add die as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 14 +++---
 tools/perf/tests/topology.c|  8 +---
 tools/perf/util/cpumap.c   | 28 ++--
 tools/perf/util/cpumap.h   |  6 +-
 tools/perf/util/stat-display.c |  6 +++---
 5 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 193e7a4e0c7b..514144dad8b1 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1389,11 +1389,7 @@ static struct aggr_cpu_id perf_env__get_die(struct 
perf_cpu_map *map, int idx, v
 * make a unique ID.
 */
id.socket = env->cpu[cpu].socket_id;
-
-   if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
-   id.id = env->cpu[cpu].die_id & 0xff;
+   id.die = env->cpu[cpu].die_id;
}
 
return id;
@@ -1407,20 +1403,16 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
 
if (cpu != -1) {
/*
-* encode die id in bit range 23:16
 * core_id is relative to socket and die,
 * we need a global id. So we combine
 * socket + die id + core id
 */
-   if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
id.socket = env->cpu[cpu].socket_id;
-   id.id = (env->cpu[cpu].die_id << 16) |
-  (env->cpu[cpu].core_id & 0x);
+   id.die = env->cpu[cpu].die_id;
+   id.id = env->cpu[cpu].core_id & 0x;
}
 
return id;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 7a07827c0707..da6ed47db491 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -117,7 +117,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
 
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
-   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
}
 
@@ -128,10 +128,10 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
 
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
-   session->header.env.cpu[map->map[i]].die_id ==
-   cpu_map__id_to_die(id.id << 16));
+   session->header.env.cpu[map->map[i]].die_id == id.die);
 
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
}
 
// Test that socket ID contains only socket
@@ -141,6 +141,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
 
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
}
 
@@ -151,6 +152,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
cpu__get_node(map->map[i]) == id.node);
TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
+   TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index d2630f03f682..10a52058d838 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -152,8 +152,10 @@ static int cmp_aggr_cpu_id(c

[PATCH v6 11/12] perf tools: Add separate core member

2020-11-26 Thread James Clark
Add core as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  9 +++--
 tools/perf/tests/topology.c|  6 +-
 tools/perf/util/cpumap.c   | 18 ++
 tools/perf/util/cpumap.h   |  6 +-
 tools/perf/util/stat-display.c | 16 
 5 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 514144dad8b1..d79a29e22dfd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1404,15 +1404,12 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
if (cpu != -1) {
/*
 * core_id is relative to socket and die,
-* we need a global id. So we combine
-* socket + die id + core id
+* we need a global id. So we set
+* socket, die id and core id
 */
-   if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
id.socket = env->cpu[cpu].socket_id;
id.die = env->cpu[cpu].die_id;
-   id.id = env->cpu[cpu].core_id & 0x;
+   id.core = env->cpu[cpu].core_id;
}
 
return id;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index da6ed47db491..6779c7b93649 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -111,7 +111,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
for (i = 0; i < map->nr; i++) {
id = cpu_map__get_core(map, i, NULL);
TEST_ASSERT_VAL("Core map - Core ID doesn't match",
-   session->header.env.cpu[map->map[i]].core_id == 
cpu_map__id_to_cpu(id.id));
+   session->header.env.cpu[map->map[i]].core_id == 
id.core);
 
TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
@@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Core map - ID is set", id.id == -1);
}
 
// Test that die ID contains socket and die
@@ -132,6 +133,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Die map - Core is set", id.core == -1);
}
 
// Test that socket ID contains only socket
@@ -143,6 +145,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1);
}
 
// Test that node ID contains only node
@@ -153,6 +156,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
+   TEST_ASSERT_VAL("Node map - Core is set", id.core == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 10a52058d838..d164f7bd1ac7 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -154,8 +154,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
return a->node - b->node;
else if (a->socket != b->socket)
return a->socket - b->socket;
-   else
+   else if (a->die != b->die)
return a->die - b->die;
+   else
+   return a->core - b->core;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -258,10 +260,7 @@ struct aggr

[PATCH v6 07/12] perf tools: Start using cpu_aggr_id in map

2020-11-26 Thread James Clark
Use the new cpu_aggr_id struct in the cpu map
instead of int so that it can store more data.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 6 +++---
 tools/perf/util/cpumap.c   | 8 
 tools/perf/util/cpumap.h   | 2 +-
 tools/perf/util/stat-display.c | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 344e50651b55..afe9fa6112b6 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1223,10 +1223,10 @@ static struct aggr_cpu_id perf_stat__get_aggr(struct 
perf_stat_config *config,
 
cpu = map->map[idx];
 
-   if (config->cpus_aggr_map->map[cpu] == -1)
-   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id;
+   if (cpu_map__aggr_cpu_id_is_empty(config->cpus_aggr_map->map[cpu]))
+   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
 
-   id.id = config->cpus_aggr_map->map[cpu];
+   id = config->cpus_aggr_map->map[cpu];
return id;
 }
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index ea81586305f4..b50609b9a585 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -97,14 +97,14 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr)
 
 struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
 {
-   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr);
+   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(struct 
aggr_cpu_id) * nr);
 
if (cpus != NULL) {
int i;
 
cpus->nr = nr;
for (i = 0; i < nr; i++)
-   cpus->map[i] = -1;
+   cpus->map[i] = cpu_map__empty_aggr_cpu_id();
 
refcount_set(&cpus->refcnt, 1);
}
@@ -169,11 +169,11 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
cpu_aggr_map **res,
for (cpu = 0; cpu < nr; cpu++) {
s1 = f(cpus, cpu, data);
for (s2 = 0; s2 < c->nr; s2++) {
-   if (s1.id == c->map[s2])
+   if (cpu_map__compare_aggr_cpu_id(s1, c->map[s2]))
break;
}
if (s2 == c->nr) {
-   c->map[c->nr] = s1.id;
+   c->map[c->nr] = s1;
c->nr++;
}
}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index b112069038be..d8fc265bc762 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -14,7 +14,7 @@ struct aggr_cpu_id {
 struct cpu_aggr_map {
refcount_t refcnt;
int nr;
-   int map[];
+   struct aggr_cpu_id map[];
 };
 
 struct perf_record_cpu_map_data;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 1a3d66329d73..da0766403d3b 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -509,7 +509,7 @@ static void aggr_update_shadow(struct perf_stat_config 
*config,
struct evsel *counter;
 
for (s = 0; s < config->aggr_map->nr; s++) {
-   id.id = config->aggr_map->map[s];
+   id = config->aggr_map->map[s];
evlist__for_each_entry(evlist, counter) {
val = 0;
for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
@@ -641,7 +641,7 @@ static void print_counter_aggrdata(struct perf_stat_config 
*config,
struct aggr_cpu_id id;
double uval;
 
-   ad.id.id = id.id = config->aggr_map->map[s];
+   ad.id = id = config->aggr_map->map[s];
ad.val = ad.ena = ad.run = 0;
ad.nr = 0;
if (!collect_data(config, counter, aggr_cb, &ad))
@@ -1169,7 +1169,7 @@ static void print_percore_thread(struct perf_stat_config 
*config,
for (int i = 0; i < evsel__nr_cpus(counter); i++) {
s2 = config->aggr_get_id(config, evsel__cpus(counter), i);
for (s = 0; s < config->aggr_map->nr; s++) {
-   id.id = config->aggr_map->map[s];
+   id = config->aggr_map->map[s];
if (cpu_map__compare_aggr_cpu_id(s2, id))
break;
}
-- 
2.28.0



[PATCH v6 08/12] perf tools: Add separate node member

2020-11-26 Thread James Clark
Add node as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  2 +-
 tools/perf/tests/topology.c|  8 +++-
 tools/perf/util/cpumap.c   | 16 +++-
 tools/perf/util/cpumap.h   |  1 +
 tools/perf/util/stat-display.c |  2 +-
 5 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index afe9fa6112b6..2db2550eef9e 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1437,7 +1437,7 @@ static struct aggr_cpu_id perf_env__get_node(struct 
perf_cpu_map *map, int idx,
int cpu = perf_env__get_cpu(data, map, idx);
struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
-   id.id = perf_env__numa_node(data, cpu);
+   id.node = perf_env__numa_node(data, cpu);
return id;
 }
 
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 8dab1b4f323f..f181646e7465 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
}
 
// Test that die ID contains socket and die
@@ -131,6 +132,8 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id ==
cpu_map__id_to_die(id.id << 16));
+
+   TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
}
 
// Test that socket ID contains only socket
@@ -138,13 +141,16 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
id = cpu_map__get_socket(map, i, NULL);
TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id == 
id.id);
+
+   TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
}
 
// Test that node ID contains only node
for (i = 0; i < map->nr; i++) {
id = cpu_map__get_node(map, i, NULL);
TEST_ASSERT_VAL("Node map - Node ID doesn't match",
-   cpu__get_node(map->map[i]) == id.id);
+   cpu__get_node(map->map[i]) == id.node);
+   TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index b50609b9a585..5f9e98ddbe34 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -148,7 +148,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer;
 
-   return a->id - b->id;
+   if (a->id != b->id)
+   return a->id - b->id;
+   else
+   return a->node - b->node;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -275,7 +278,7 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map 
*map, int idx, void *da
if (idx < 0 || idx >= map->nr)
return id;
 
-   id.id = cpu_map__get_node_id(map->map[idx]);
+   id.node = cpu_map__get_node_id(map->map[idx]);
return id;
 }
 
@@ -620,18 +623,21 @@ const struct perf_cpu_map *cpu_map__online(void) /* 
thread unsafe */
 
 bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
 {
-   return a.id == b.id;
+   return a.id == b.id &&
+   a.node == b.node;
 }
 
 bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
 {
-   return a.id == -1;
+   return a.id == -1 &&
+   a.node == -1;
 }
 
 struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
 {
struct aggr_cpu_id ret = {
-   .id = -1
+   .id = -1,
+   .node = -1
};
return ret;
 }
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index d8fc265bc762..f79e92603024 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -9,6 +9,7 @@
 
 struct aggr_cpu_id {
int id;
+   int node;
 };
 
 struct cpu_aggr_map {
diff --git a/tools/perf/ut

[PATCH v6 12/12] perf tools: Add separate thread member

2020-11-26 Thread James Clark
A separate field isn't strictly required. The core
field could be re-used for thread IDs as a single
field was used previously.

But separating them will avoid confusion and catch
potential errors where core IDs are read as thread
IDs and vice versa.

Also remove the placeholder id field which is now
no longer used.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/tests/topology.c|  8 
 tools/perf/util/cpumap.c   | 14 +++---
 tools/perf/util/cpumap.h   |  2 +-
 tools/perf/util/stat-display.c |  8 
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 6779c7b93649..078051116546 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -119,7 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
-   TEST_ASSERT_VAL("Core map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1);
}
 
// Test that die ID contains socket and die
@@ -132,8 +132,8 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].die_id == id.die);
 
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
-   TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Die map - Core is set", id.core == -1);
+   TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1);
}
 
// Test that socket ID contains only socket
@@ -144,8 +144,8 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
-   TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1);
+   TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1);
}
 
// Test that node ID contains only node
@@ -153,10 +153,10 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
id = cpu_map__get_node(map, i, NULL);
TEST_ASSERT_VAL("Node map - Node ID doesn't match",
cpu__get_node(map->map[i]) == id.node);
-   TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
TEST_ASSERT_VAL("Node map - Core is set", id.core == -1);
+   TEST_ASSERT_VAL("Node map - Thread is set", id.thread == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index d164f7bd1ac7..87d3eca9b872 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -148,16 +148,16 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer;
 
-   if (a->id != b->id)
-   return a->id - b->id;
-   else if (a->node != b->node)
+   if (a->node != b->node)
return a->node - b->node;
else if (a->socket != b->socket)
return a->socket - b->socket;
else if (a->die != b->die)
return a->die - b->die;
-   else
+   else if (a->core != b->core)
return a->core - b->core;
+   else
+   return a->thread - b->thread;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -616,7 +616,7 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread 
unsafe */
 
 bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
 {
-   return a.id == b.id &&
+   return a.thread == b.thread &&
a.node == b.node &&
a.socket == b.socket &&
a.die == b.die &&
@@ -625,7 +625,7 @@ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, 
struct a

[PATCH v6 09/12] perf tools: Add separate socket member

2020-11-26 Thread James Clark
Add socket as a separate member so that it doesn't have to be
packed into the int value. When the socket ID was larger than
8 bits the output appeared corrupted or incomplete.

For example, here on ThunderX2 perf stat reports a socket
of -1 and an invalid die number:

  ./perf stat -a --per-die
  The socket id number is too big.

  Performance counter stats for 'system wide':

  S-1-D255   128 687.99 msec cpu-clock #   
57.240 CPUs utilized
  ...
  S36-D0 128 842.34 msec cpu-clock #   
70.081 CPUs utilized
  ...

And with --per-core there is an entry with an invalid core ID:

  ./perf stat record -a --per-core
  The socket id number is too big.

  Performance counter stats for 'system wide':
  S-1-D255-C65535 128 671.04 msec cpu-clock #   
54.112 CPUs utilized
  ...
  S36-D0-C0   4  28.27 msec cpu-clock #
2.279 CPUs utilized
  ...

This fixes the "Session topology" self test on ThunderX2.

After this fix the output contains the correct socket and die
IDs and no longer prints a warning about the size of the
socket ID:

  ./perf stat --per-die -a

  Performance counter stats for 'system wide':

  S36-D0 128 169,869.39 msec cpu-clock #  
127.501 CPUs utilized
  ...
  S3612-D0 128 169,733.05 msec cpu-clock     #  
127.398 CPUs utilized

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 22 +++--
 tools/perf/tests/topology.c| 10 
 tools/perf/util/cpumap.c   | 44 +-
 tools/perf/util/cpumap.h   |  6 +
 tools/perf/util/stat-display.c |  8 +++
 tools/perf/util/stat.c |  2 +-
 6 files changed, 41 insertions(+), 51 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2db2550eef9e..193e7a4e0c7b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1371,7 +1371,7 @@ static struct aggr_cpu_id perf_env__get_socket(struct 
perf_cpu_map *map, int idx
struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
if (cpu != -1)
-   id.id = env->cpu[cpu].socket_id;
+   id.socket = env->cpu[cpu].socket_id;
 
return id;
 }
@@ -1384,18 +1384,16 @@ static struct aggr_cpu_id perf_env__get_die(struct 
perf_cpu_map *map, int idx, v
 
if (cpu != -1) {
/*
-* Encode socket in bit range 15:8
-* die_id is relative to socket,
-* we need a global id. So we combine
-* socket + die id
+* die_id is relative to socket, so start
+* with the socket ID and then add die to
+* make a unique ID.
 */
-   if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id 
number is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
+   id.socket = env->cpu[cpu].socket_id;
 
if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
-   id.id = (env->cpu[cpu].socket_id << 8) | (env->cpu[cpu].die_id 
& 0xff);
+   id.id = env->cpu[cpu].die_id & 0xff;
}
 
return id;
@@ -1409,23 +1407,19 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
 
if (cpu != -1) {
/*
-* Encode socket in bit range 31:24
 * encode die id in bit range 23:16
 * core_id is relative to socket and die,
 * we need a global id. So we combine
 * socket + die id + core id
 */
-   if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id 
number is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
-   id.id = (env->cpu[cpu].socket_id << 24) |
-  (env->cpu[cpu].die_id << 16) |
+   id.socket = env->cpu[cpu].socket_id;
+   id.id = (env->cpu[cpu].die_id << 16) |
   (env->cpu[cpu].core_id & 0x);
}
 
diff --git a/tools/

[PATCH v6 03/12] perf tools: Add new struct for cpu aggregation

2020-11-26 Thread James Clark
This struct currently has only a single int member so that
it can be used as a drop in replacement for the existing
behaviour.

Comparison and constructor functions have also been added
that will replace usages of '==' and '= -1'.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/util/cpumap.c | 18 ++
 tools/perf/util/cpumap.h |  8 
 2 files changed, 26 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 20e3a75953fc..8624948b4f1d 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -586,3 +586,21 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread 
unsafe */
 
return online;
 }
+
+bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
+{
+   return a.id == b.id;
+}
+
+bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
+{
+   return a.id == -1;
+}
+
+struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
+{
+   struct aggr_cpu_id ret = {
+   .id = -1
+   };
+   return ret;
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 3a442f021468..1cdccc69cd4b 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -7,6 +7,10 @@
 #include 
 #include 
 
+struct aggr_cpu_id {
+   int id;
+};
+
 struct perf_record_cpu_map_data;
 
 struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
@@ -64,4 +68,8 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
perf_cpu_map **res,
 int cpu_map__cpu(struct perf_cpu_map *cpus, int idx);
 bool cpu_map__has(struct perf_cpu_map *cpus, int cpu);
 
+bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b);
+bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a);
+struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void);
+
 #endif /* __PERF_CPUMAP_H */
-- 
2.28.0



[PATCH v6 00/12] perf tools: fix perf stat with large socket IDs

2020-11-26 Thread James Clark
Changes since v5:
  * Fix test for cpu_map__get_die() by shifting id before testing.
  * Fix test for cpu_map__get_socket() by not using cpu_map__id_to_socket()
which is only valid in CPU aggregation mode.

James Clark (12):
  perf tools: Improve topology test
  perf tools: Use allocator for perf_cpu_map
  perf tools: Add new struct for cpu aggregation
  perf tools: Replace aggregation ID with a struct
  perf tools: add new map type for aggregation
  perf tools: drop in cpu_aggr_map struct
  perf tools: Start using cpu_aggr_id in map
  perf tools: Add separate node member
  perf tools: Add separate socket member
  perf tools: Add separate die member
  perf tools: Add separate core member
  perf tools: Add separate thread member

 tools/perf/builtin-stat.c  | 128 
 tools/perf/tests/topology.c|  64 ++--
 tools/perf/util/cpumap.c   | 171 ++---
 tools/perf/util/cpumap.h   |  55 ++-
 tools/perf/util/stat-display.c | 102 
 tools/perf/util/stat.c |   2 +-
 tools/perf/util/stat.h |   9 +-
 7 files changed, 337 insertions(+), 194 deletions(-)

-- 
2.28.0



[PATCH v6 02/12] perf tools: Use allocator for perf_cpu_map

2020-11-26 Thread James Clark
Use the existing allocator for perf_cpu_map to avoid use
of raw malloc. This could cause an issue in later commits
where the size of perf_cpu_map is changed.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/util/cpumap.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index dc5c5e6fc502..20e3a75953fc 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -132,15 +132,16 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
perf_cpu_map **res,
   int (*f)(struct perf_cpu_map *map, int cpu, void *data),
   void *data)
 {
-   struct perf_cpu_map *c;
int nr = cpus->nr;
+   struct perf_cpu_map *c = perf_cpu_map__empty_new(nr);
int cpu, s1, s2;
 
-   /* allocate as much as possible */
-   c = calloc(1, sizeof(*c) + nr * sizeof(int));
if (!c)
return -1;
 
+   /* Reset size as it may only be partially filled */
+   c->nr = 0;
+
for (cpu = 0; cpu < nr; cpu++) {
s1 = f(cpus, cpu, data);
for (s2 = 0; s2 < c->nr; s2++) {
@@ -155,7 +156,6 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
perf_cpu_map **res,
/* ensure we process id in increasing order */
qsort(c->map, c->nr, sizeof(int), cmp_ids);
 
-   refcount_set(&c->refcnt, 1);
*res = c;
return 0;
 }
-- 
2.28.0



[PATCH v6 06/12] perf tools: drop in cpu_aggr_map struct

2020-11-26 Thread James Clark
Replace usages of perf_cpu_map with cpu_aggr map in
places that are involved with perf stat aggregation.

This will then later be changed to be a map of
cpu_aggr_id rather than an int so that more data can
be stored.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c | 29 ++---
 tools/perf/util/cpumap.c  | 12 ++--
 tools/perf/util/cpumap.h  | 10 +-
 tools/perf/util/stat.h|  4 ++--
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f10c67a26472..344e50651b55 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1322,14 +1322,29 @@ static int perf_stat_init_aggr_mode(void)
 * the aggregation translate cpumap.
 */
nr = perf_cpu_map__max(evsel_list->core.cpus);
-   stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1);
+   stat_config.cpus_aggr_map = cpu_aggr_map__empty_new(nr + 1);
return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
 
+static void cpu_aggr_map__delete(struct cpu_aggr_map *map)
+{
+   if (map) {
+   WARN_ONCE(refcount_read(&map->refcnt) != 0,
+ "cpu_aggr_map refcnt unbalanced\n");
+   free(map);
+   }
+}
+
+static void cpu_aggr_map__put(struct cpu_aggr_map *map)
+{
+   if (map && refcount_dec_and_test(&map->refcnt))
+   cpu_aggr_map__delete(map);
+}
+
 static void perf_stat__exit_aggr_mode(void)
 {
-   perf_cpu_map__put(stat_config.aggr_map);
-   perf_cpu_map__put(stat_config.cpus_aggr_map);
+   cpu_aggr_map__put(stat_config.aggr_map);
+   cpu_aggr_map__put(stat_config.cpus_aggr_map);
stat_config.aggr_map = NULL;
stat_config.cpus_aggr_map = NULL;
 }
@@ -1427,25 +1442,25 @@ static struct aggr_cpu_id perf_env__get_node(struct 
perf_cpu_map *map, int idx,
 }
 
 static int perf_env__build_socket_map(struct perf_env *env, struct 
perf_cpu_map *cpus,
- struct perf_cpu_map **sockp)
+ struct cpu_aggr_map **sockp)
 {
return cpu_map__build_map(cpus, sockp, perf_env__get_socket, env);
 }
 
 static int perf_env__build_die_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-  struct perf_cpu_map **diep)
+  struct cpu_aggr_map **diep)
 {
return cpu_map__build_map(cpus, diep, perf_env__get_die, env);
 }
 
 static int perf_env__build_core_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-   struct perf_cpu_map **corep)
+   struct cpu_aggr_map **corep)
 {
return cpu_map__build_map(cpus, corep, perf_env__get_core, env);
 }
 
 static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-   struct perf_cpu_map **nodep)
+   struct cpu_aggr_map **nodep)
 {
return cpu_map__build_map(cpus, nodep, perf_env__get_node, env);
 }
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index b18e53506656..ea81586305f4 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -151,12 +151,12 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
return a->id - b->id;
 }
 
-int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res,
+int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
   struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int 
cpu, void *data),
   void *data)
 {
int nr = cpus->nr;
-   struct perf_cpu_map *c = perf_cpu_map__empty_new(nr);
+   struct cpu_aggr_map *c = cpu_aggr_map__empty_new(nr);
int cpu, s2;
struct aggr_cpu_id s1;
 
@@ -279,22 +279,22 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map 
*map, int idx, void *da
return id;
 }
 
-int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**sockp)
+int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**sockp)
 {
return cpu_map__build_map(cpus, sockp, cpu_map__get_socket, NULL);
 }
 
-int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**diep)
+int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**diep)
 {
return cpu_map__build_map(cpus, diep, cpu_map__get_die, NULL);
 }
 
-int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**corep)
+int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**corep)
 {
return cpu_map__build_map(cpus, corep, cpu_map__get_co

[PATCH v6 04/12] perf tools: Replace aggregation ID with a struct

2020-11-26 Thread James Clark
Replace all occurrences of the usage of int with the new struct
cpu_aggr_id.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  76 +--
 tools/perf/tests/topology.c|  17 +++---
 tools/perf/util/cpumap.c   |  82 ++---
 tools/perf/util/cpumap.h   |  10 +--
 tools/perf/util/stat-display.c | 108 +++--
 tools/perf/util/stat.c |   2 +-
 tools/perf/util/stat.h |   5 +-
 7 files changed, 173 insertions(+), 127 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f15b2f8aa14d..f10c67a26472 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1188,65 +1188,67 @@ static struct option stat_options[] = {
OPT_END()
 };
 
-static int perf_stat__get_socket(struct perf_stat_config *config 
__maybe_unused,
+static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config 
*config __maybe_unused,
 struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_socket(map, cpu, NULL);
 }
 
-static int perf_stat__get_die(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config 
__maybe_unused,
  struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_die(map, cpu, NULL);
 }
 
-static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config 
__maybe_unused,
   struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_core(map, cpu, NULL);
 }
 
-static int perf_stat__get_node(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config 
__maybe_unused,
   struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_node(map, cpu, NULL);
 }
 
-static int perf_stat__get_aggr(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
   aggr_get_id_t get_id, struct perf_cpu_map *map, 
int idx)
 {
int cpu;
+   struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
if (idx >= map->nr)
-   return -1;
+   return id;
 
cpu = map->map[idx];
 
if (config->cpus_aggr_map->map[cpu] == -1)
-   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
+   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id;
 
-   return config->cpus_aggr_map->map[cpu];
+   id.id = config->cpus_aggr_map->map[cpu];
+   return id;
 }
 
-static int perf_stat__get_socket_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_socket_cached(struct perf_stat_config 
*config,
struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_socket, map, idx);
 }
 
-static int perf_stat__get_die_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config 
*config,
struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_die, map, idx);
 }
 
-static int perf_stat__get_core_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config 
*config,
  struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_core, map, idx);
 }
 
-static int perf_stat__get_node_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_node_cached(struct perf_stat_config 
*config,
  struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_node, map, idx);
@@ -1347,18 +1349,23 @@ static inline int perf_env__get_cpu(struct perf_env 
*env, struct perf_cpu_map *m
return cpu;
 }
 
-static int perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int 
idx, void *data)
 {
struct perf_env *env = data;
int cpu = perf_env__get_cpu(env, map, idx);
+   struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+
+   if (cpu != -1)
+   id.id = env->cpu[cpu].socket_id;
 
-   return cpu == -1 ? -1 : env->cpu[cpu].socket_id;
+   return id;
 }
 
-static int perf_env__get_die(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_e

[PATCH v6 05/12] perf tools: add new map type for aggregation

2020-11-26 Thread James Clark
Currently this is a duplicate of perf_cpu_map so that
it can be used as a drop-in replacement.

In a later commit it will be changed from a map of ints
to use the new cpu_aggr_id struct.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/util/cpumap.c | 17 +
 tools/perf/util/cpumap.h |  8 
 2 files changed, 25 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index e05a12bde073..b18e53506656 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -95,6 +95,23 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr)
return cpus;
 }
 
+struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
+{
+   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr);
+
+   if (cpus != NULL) {
+   int i;
+
+   cpus->nr = nr;
+   for (i = 0; i < nr; i++)
+   cpus->map[i] = -1;
+
+   refcount_set(&cpus->refcnt, 1);
+   }
+
+   return cpus;
+}
+
 static int cpu__get_topology_int(int cpu, const char *name, int *value)
 {
char path[PATH_MAX];
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index b8c2288a3f6d..ebd65c4f431b 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -11,9 +11,17 @@ struct aggr_cpu_id {
int id;
 };
 
+struct cpu_aggr_map {
+   refcount_t refcnt;
+   int nr;
+   int map[];
+};
+
 struct perf_record_cpu_map_data;
 
 struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
+struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
+
 struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data);
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
-- 
2.28.0



[PATCH v6 01/12] perf tools: Improve topology test

2020-11-26 Thread James Clark
Improve the topology test to check all aggregation
types. This is to lock down the behaviour before
'id' is changed into a struct in later commits.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/tests/topology.c | 53 -
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 22daf2bdf5fa..8228a1de7ac8 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -64,10 +64,11 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
.path = path,
.mode = PERF_DATA_MODE_READ,
};
-   int i;
+   int i, id;
 
session = perf_session__new(&data, false, NULL);
TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
+   cpu__setup_cpunode_map();
 
/* On platforms with large numbers of CPUs process_cpu_topology()
 * might issue an error while reading the perf.data file section
@@ -85,11 +86,18 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 *  "socket_id number is too big. You may need to upgrade the
 *  perf tool."
 *
-*  This is the reason why this test might be skipped.
+*  This is the reason why this test might be skipped. aarch64 and
+*  s390 always write this part of the header, even when the above
+*  condition is true (see do_core_id_test in header.c). So always
+*  run this test on those platforms.
 */
-   if (!session->header.env.cpu)
+   if (!session->header.env.cpu
+   && strncmp(session->header.env.arch, "s390", 4)
+   && strncmp(session->header.env.arch, "aarch64", 7))
return TEST_SKIP;
 
+   TEST_ASSERT_VAL("Session header CPU map not set", 
session->header.env.cpu);
+
for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
if (!cpu_map__has(map, i))
continue;
@@ -98,14 +106,45 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 session->header.env.cpu[i].socket_id);
}
 
+   // Test that core ID contains socket, die and core
+   for (i = 0; i < map->nr; i++) {
+   id = cpu_map__get_core(map, i, NULL);
+   TEST_ASSERT_VAL("Core map - Core ID doesn't match",
+   session->header.env.cpu[map->map[i]].core_id == 
cpu_map__id_to_cpu(id));
+
+   TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
+   session->header.env.cpu[map->map[i]].socket_id ==
+   cpu_map__id_to_socket(id));
+
+   TEST_ASSERT_VAL("Core map - Die ID doesn't match",
+   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id));
+   }
+
+   // Test that die ID contains socket and die
for (i = 0; i < map->nr; i++) {
-   TEST_ASSERT_VAL("Core ID doesn't match",
-   (session->header.env.cpu[map->map[i]].core_id == 
(cpu_map__get_core(map, i, NULL) & 0x)));
+   id = cpu_map__get_die(map, i, NULL);
+   TEST_ASSERT_VAL("Die map - Socket ID doesn't match",
+   session->header.env.cpu[map->map[i]].socket_id ==
+   cpu_map__id_to_socket(id << 16));
+
+   TEST_ASSERT_VAL("Die map - Die ID doesn't match",
+   session->header.env.cpu[map->map[i]].die_id ==
+   cpu_map__id_to_die(id << 16));
+   }
 
-   TEST_ASSERT_VAL("Socket ID doesn't match",
-   (session->header.env.cpu[map->map[i]].socket_id == 
cpu_map__get_socket(map, i, NULL)));
+   // Test that socket ID contains only socket
+   for (i = 0; i < map->nr; i++) {
+   id = cpu_map__get_socket(map, i, NULL);
+   TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
+   session->header.env.cpu[map->map[i]].socket_id == id);
}
 
+   // Test that node ID contains only node
+   for (i = 0; i < map->nr; i++) {
+   id = cpu_map__get_node(map, i, NULL);
+   TEST_ASSERT_VAL("Node map - Node ID doesn't match",
+   cpu__get_node(map->map[i]) == id);
+   }
perf_session__delete(session);
 
return 0;
-- 
2.28.0



Re: [PATCH v5 01/12] perf tools: Improve topology test

2020-11-26 Thread James Clark



On 18/11/2020 13:21, Namhyung Kim wrote:
> Hello,
> 
> On Tue, Nov 17, 2020 at 11:49 PM James Clark  wrote:
>>
>> Improve the topology test to check all aggregation
>> types. This is to lock down the behaviour before
>> 'id' is changed into a struct in later commits.
>>
>> Signed-off-by: James Clark 
>> Cc: Peter Zijlstra 
>> Cc: Ingo Molnar 
>> Cc: Arnaldo Carvalho de Melo 
>> Cc: Mark Rutland 
>> Cc: Alexander Shishkin 
>> Cc: Jiri Olsa 
>> Cc: Namhyung Kim 
>> Cc: Thomas Richter 
>> Cc: John Garry 
>> ---
>>  tools/perf/tests/topology.c | 53 -
>>  1 file changed, 46 insertions(+), 7 deletions(-)
>>
>> diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
>> index 22daf2bdf5fa..7bd8848d36b6 100644
>> --- a/tools/perf/tests/topology.c
>> +++ b/tools/perf/tests/topology.c
>> @@ -64,10 +64,11 @@ static int check_cpu_topology(char *path, struct 
>> perf_cpu_map *map)
>> .path = path,
>> .mode = PERF_DATA_MODE_READ,
>> };
>> -   int i;
>> +   int i, id;
>>
>> session = perf_session__new(&data, false, NULL);
>> TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
>> +   cpu__setup_cpunode_map();
>>
>> /* On platforms with large numbers of CPUs process_cpu_topology()
>>  * might issue an error while reading the perf.data file section
>> @@ -85,11 +86,18 @@ static int check_cpu_topology(char *path, struct 
>> perf_cpu_map *map)
>>  *  "socket_id number is too big. You may need to upgrade the
>>  *  perf tool."
>>  *
>> -*  This is the reason why this test might be skipped.
>> +*  This is the reason why this test might be skipped. aarch64 and
>> +*  s390 always write this part of the header, even when the above
>> +*  condition is true (see do_core_id_test in header.c). So always
>> +*  run this test on those platforms.
>>  */
>> -   if (!session->header.env.cpu)
>> +   if (!session->header.env.cpu
>> +   && strncmp(session->header.env.arch, "s390", 4)
>> +   && strncmp(session->header.env.arch, "aarch64", 7))
>> return TEST_SKIP;
>>
>> +   TEST_ASSERT_VAL("Session header CPU map not set", 
>> session->header.env.cpu);
>> +
>> for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
>> if (!cpu_map__has(map, i))
>> continue;
>> @@ -98,14 +106,45 @@ static int check_cpu_topology(char *path, struct 
>> perf_cpu_map *map)
>>  session->header.env.cpu[i].socket_id);
>> }
>>
>> +   // Test that core ID contains socket, die and core
>> +   for (i = 0; i < map->nr; i++) {
>> +   id = cpu_map__get_core(map, i, NULL);
>> +   TEST_ASSERT_VAL("Core map - Core ID doesn't match",
>> +   session->header.env.cpu[map->map[i]].core_id == 
>> cpu_map__id_to_cpu(id));
>> +
>> +   TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
>> +   session->header.env.cpu[map->map[i]].socket_id ==
>> +   cpu_map__id_to_socket(id));
>> +
>> +   TEST_ASSERT_VAL("Core map - Die ID doesn't match",
>> +   session->header.env.cpu[map->map[i]].die_id == 
>> cpu_map__id_to_die(id));
>> +   }
>> +
>> +   // Test that die ID contains socket and die
>> for (i = 0; i < map->nr; i++) {
>> -   TEST_ASSERT_VAL("Core ID doesn't match",
>> -   (session->header.env.cpu[map->map[i]].core_id == 
>> (cpu_map__get_core(map, i, NULL) & 0x)));
>> +   id = cpu_map__get_die(map, i, NULL);
>> +   TEST_ASSERT_VAL("Die map - Socket ID doesn't match",
>> +   session->header.env.cpu[map->map[i]].socket_id ==
>> +   cpu_map__id_to_socket(id));
> 
> I'm not sure it works.  It seems cpu_map__get_die() returns
> 16 bit id (socket | die) but cpu_map__id_to_socket() takes
> 32 bit id (socket | die | core), right?

Hi Namhyung,

Yes you are r

Re: [PATCH 07/13 v4] perf tools: restrict visibility of functions

2020-11-17 Thread James Clark



On 15/11/2020 23:17, Jiri Olsa wrote:
> On Fri, Nov 13, 2020 at 07:26:48PM +0200, James Clark wrote:
>> These cpu_aggr_map refcounting functions are only used in
>> builtin-stat.c so their visibilty can be reduced to just
>> that file.
>>
>> No functional changes.
>>
>> Signed-off-by: James Clark 
>> Cc: Peter Zijlstra 
>> Cc: Ingo Molnar 
>> Cc: Arnaldo Carvalho de Melo 
>> Cc: Mark Rutland 
>> Cc: Alexander Shishkin 
>> Cc: Jiri Olsa 
>> Cc: Namhyung Kim 
>> Cc: Thomas Richter 
>> Cc: John Garry 
>> ---
>>  tools/perf/builtin-stat.c | 15 +++
>>  tools/perf/util/cpumap.c  | 15 ---
>>  tools/perf/util/cpumap.h  |  2 --
>>  3 files changed, 15 insertions(+), 17 deletions(-)
>>
>> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
>> index 7daac139f6cc..344e50651b55 100644
>> --- a/tools/perf/builtin-stat.c
>> +++ b/tools/perf/builtin-stat.c
>> @@ -1326,6 +1326,21 @@ static int perf_stat_init_aggr_mode(void)
>>  return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
>>  }
>>  
>> +static void cpu_aggr_map__delete(struct cpu_aggr_map *map)
>> +{
>> +if (map) {
>> +WARN_ONCE(refcount_read(&map->refcnt) != 0,
>> +  "cpu_aggr_map refcnt unbalanced\n");
>> +free(map);
>> +}
>> +}
>> +
>> +static void cpu_aggr_map__put(struct cpu_aggr_map *map)
>> +{
>> +if (map && refcount_dec_and_test(&map->refcnt))
>> +cpu_aggr_map__delete(map);
>> +}
> 
> you could add them directly as static and skip this change

I think I was trying to avoid the compilation error from the static
functions not being used. But I moved the addition into the commit
where they are used in V5.

James

> 
> jirka
> 
>> +
>>  static void perf_stat__exit_aggr_mode(void)
>>  {
>>  cpu_aggr_map__put(stat_config.aggr_map);
>> diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
>> index e831a18ec95e..e90270f0be57 100644
>> --- a/tools/perf/util/cpumap.c
>> +++ b/tools/perf/util/cpumap.c
>> @@ -112,21 +112,6 @@ struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
>>  return cpus;
>>  }
>>  
>> -void cpu_aggr_map__delete(struct cpu_aggr_map *map)
>> -{
>> -if (map) {
>> -WARN_ONCE(refcount_read(&map->refcnt) != 0,
>> -  "cpu_aggr_map refcnt unbalanced\n");
>> -free(map);
>> -}
>> -}
>> -
>> -void cpu_aggr_map__put(struct cpu_aggr_map *map)
>> -{
>> -if (map && refcount_dec_and_test(&map->refcnt))
>> -cpu_aggr_map__delete(map);
>> -}
>> -
>>  static int cpu__get_topology_int(int cpu, const char *name, int *value)
>>  {
>>  char path[PATH_MAX];
>> diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
>> index d82822ddcbce..b112069038be 100644
>> --- a/tools/perf/util/cpumap.h
>> +++ b/tools/perf/util/cpumap.h
>> @@ -21,8 +21,6 @@ struct perf_record_cpu_map_data;
>>  
>>  struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
>>  struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
>> -void cpu_aggr_map__delete(struct cpu_aggr_map *map);
>> -void cpu_aggr_map__put(struct cpu_aggr_map *map);
>>  
>>  struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data 
>> *data);
>>  size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
>> -- 
>> 2.28.0
>>
> 


Re: [PATCH 13/13 v4] perf tools: add thread field

2020-11-17 Thread James Clark



On 15/11/2020 23:17, Jiri Olsa wrote:
> On Fri, Nov 13, 2020 at 07:26:54PM +0200, James Clark wrote:
>> A separate field isn't strictly required. The core
>> field could be re-used for thread IDs as a single
>> field was used previously.
>>
>> But separating them will avoid confusion and catch
>> potential errors where core IDs are read as thread
>> IDs and vice versa.
>>
>> Also remove the placeholder id field which is now
>> no longer used.
>>
>> Signed-off-by: James Clark 
>> Cc: Peter Zijlstra 
>> Cc: Ingo Molnar 
>> Cc: Arnaldo Carvalho de Melo 
>> Cc: Mark Rutland 
>> Cc: Alexander Shishkin 
>> Cc: Jiri Olsa 
>> Cc: Namhyung Kim 
>> Cc: Thomas Richter 
>> Cc: John Garry 
>> ---
>>  tools/perf/tests/topology.c|  8 
>>  tools/perf/util/cpumap.c   | 14 +++---
>>  tools/perf/util/cpumap.h   |  2 +-
>>  tools/perf/util/stat-display.c |  8 
>>  4 files changed, 16 insertions(+), 16 deletions(-)
>>
>> diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
>> index 694f786a77f3..2276db0b1b6f 100644
>> --- a/tools/perf/tests/topology.c
>> +++ b/tools/perf/tests/topology.c
>> @@ -119,7 +119,7 @@ static int check_cpu_topology(char *path, struct 
>> perf_cpu_map *map)
>>  TEST_ASSERT_VAL("Core map - Die ID doesn't match",
>>  session->header.env.cpu[map->map[i]].die_id == id.die);
>>  TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
>> -TEST_ASSERT_VAL("Core map - ID is set", id.id == -1);
>> +TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1);
>>  }
>>  
>>  // Test that die ID contains socket and die
>> @@ -131,7 +131,7 @@ static int check_cpu_topology(char *path, struct 
>> perf_cpu_map *map)
>>  TEST_ASSERT_VAL("Die map - Die ID doesn't match",
>>  session->header.env.cpu[map->map[i]].die_id == id.die);
>>  TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
>> -TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
>> +TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1);
>>  }
>>  
>>  // Test that socket ID contains only socket
>> @@ -141,7 +141,7 @@ static int check_cpu_topology(char *path, struct 
>> perf_cpu_map *map)
>>  session->header.env.cpu[map->map[i]].socket_id == 
>> id.socket);
>>  TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
>>  TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
>> -TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
>> +TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1);
>>  }
>>  
>>  // Test that node ID contains only node
>> @@ -149,7 +149,7 @@ static int check_cpu_topology(char *path, struct 
>> perf_cpu_map *map)
>>  id = cpu_map__get_node(map, i, NULL);
>>  TEST_ASSERT_VAL("Node map - Node ID doesn't match",
>>  cpu__get_node(map->map[i]) == id.node);
>> -TEST_ASSERT_VAL("Node map - ID shouldn't be set", id.id == -1);
>> +TEST_ASSERT_VAL("Node map - Thread shouldn't be set", id.thread 
>> == -1);
>>  TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
>>  }
> 
> should we test all unset parts are -1, like here id.core,
> id.socket and there are missing tests also in above code

Yes I think that's a good idea. I added all the missing ones in V5.

Thanks for the review.

James

> 
> jirka
> 


[PATCH v5 05/12] perf tools: add new map type for aggregation

2020-11-17 Thread James Clark
Currently this is a duplicate of perf_cpu_map so that
it can be used as a drop-in replacement.

In a later commit it will be changed from a map of ints
to use the new cpu_aggr_id struct.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/util/cpumap.c | 17 +
 tools/perf/util/cpumap.h |  8 
 2 files changed, 25 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index e05a12bde073..b18e53506656 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -95,6 +95,23 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr)
return cpus;
 }
 
+struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
+{
+   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr);
+
+   if (cpus != NULL) {
+   int i;
+
+   cpus->nr = nr;
+   for (i = 0; i < nr; i++)
+   cpus->map[i] = -1;
+
+   refcount_set(&cpus->refcnt, 1);
+   }
+
+   return cpus;
+}
+
 static int cpu__get_topology_int(int cpu, const char *name, int *value)
 {
char path[PATH_MAX];
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index b8c2288a3f6d..ebd65c4f431b 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -11,9 +11,17 @@ struct aggr_cpu_id {
int id;
 };
 
+struct cpu_aggr_map {
+   refcount_t refcnt;
+   int nr;
+   int map[];
+};
+
 struct perf_record_cpu_map_data;
 
 struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
+struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
+
 struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data);
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
-- 
2.28.0



[PATCH v5 00/12] perf tools: fix perf stat with large socket IDs

2020-11-17 Thread James Clark
Changes since v4:

* Test all fields in topology test, even if they should be -1
* Remove extra refcount from cpu_map__build_map()
* Reduce the changes in sort_aggr_thread()
* Move addition of cpu_aggr_map__put() and cpu_aggr_map__delete()
  into the commit where they are used so that they don't have
  to be changed to static in a separate commit

James Clark (12):
  perf tools: Improve topology test
  perf tools: Use allocator for perf_cpu_map
  perf tools: Add new struct for cpu aggregation
  perf tools: Replace aggregation ID with a struct
  perf tools: add new map type for aggregation
  perf tools: drop in cpu_aggr_map struct
  perf tools: Start using cpu_aggr_id in map
  perf tools: Add separate node member
  perf tools: Add separate socket member
  perf tools: Add separate die member
  perf tools: Add separate core member
  perf tools: Add separate thread member

 tools/perf/builtin-stat.c  | 128 
 tools/perf/tests/topology.c|  62 ++--
 tools/perf/util/cpumap.c   | 171 ++---
 tools/perf/util/cpumap.h   |  55 ++-
 tools/perf/util/stat-display.c | 102 
 tools/perf/util/stat.c |   2 +-
 tools/perf/util/stat.h |   9 +-
 7 files changed, 335 insertions(+), 194 deletions(-)

-- 
2.28.0



[PATCH v5 12/12] perf tools: Add separate thread member

2020-11-17 Thread James Clark
A separate field isn't strictly required. The core
field could be re-used for thread IDs as a single
field was used previously.

But separating them will avoid confusion and catch
potential errors where core IDs are read as thread
IDs and vice versa.

Also remove the placeholder id field which is now
no longer used.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/tests/topology.c|  8 
 tools/perf/util/cpumap.c   | 14 +++---
 tools/perf/util/cpumap.h   |  2 +-
 tools/perf/util/stat-display.c |  8 
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 3baaac6c7454..b73e92a15cdc 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -119,7 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
-   TEST_ASSERT_VAL("Core map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1);
}
 
// Test that die ID contains socket and die
@@ -131,8 +131,8 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
-   TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Die map - Core is set", id.core == -1);
+   TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1);
}
 
// Test that socket ID contains only socket
@@ -142,8 +142,8 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
-   TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1);
+   TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1);
}
 
// Test that node ID contains only node
@@ -151,10 +151,10 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
id = cpu_map__get_node(map, i, NULL);
TEST_ASSERT_VAL("Node map - Node ID doesn't match",
cpu__get_node(map->map[i]) == id.node);
-   TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
TEST_ASSERT_VAL("Node map - Core is set", id.core == -1);
+   TEST_ASSERT_VAL("Node map - Thread is set", id.thread == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index d164f7bd1ac7..87d3eca9b872 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -148,16 +148,16 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer;
 
-   if (a->id != b->id)
-   return a->id - b->id;
-   else if (a->node != b->node)
+   if (a->node != b->node)
return a->node - b->node;
else if (a->socket != b->socket)
return a->socket - b->socket;
else if (a->die != b->die)
return a->die - b->die;
-   else
+   else if (a->core != b->core)
return a->core - b->core;
+   else
+   return a->thread - b->thread;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -616,7 +616,7 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread 
unsafe */
 
 bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
 {
-   return a.id == b.id &&
+   return a.thread == b.thread &&
a.node == b.node &&
a.socke

Re: [PATCH 02/13 v4] perf tools: Use allocator for perf_cpu_map

2020-11-17 Thread James Clark



On 15/11/2020 23:17, Jiri Olsa wrote:
> On Fri, Nov 13, 2020 at 07:26:43PM +0200, James Clark wrote:
>> Use the existing allocator for perf_cpu_map to avoid use
>> of raw malloc. This could cause an issue in later commits
>> where the size of perf_cpu_map is changed.
>>
>> No functional changes.
>>
>> Signed-off-by: James Clark 
>> Cc: Peter Zijlstra 
>> Cc: Ingo Molnar 
>> Cc: Arnaldo Carvalho de Melo 
>> Cc: Mark Rutland 
>> Cc: Alexander Shishkin 
>> Cc: Jiri Olsa 
>> Cc: Namhyung Kim 
>> Cc: Thomas Richter 
>> Cc: John Garry 
>> ---
>>  tools/perf/util/cpumap.c | 7 ---
>>  1 file changed, 4 insertions(+), 3 deletions(-)
>>
>> diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
>> index dc5c5e6fc502..fd7d0a77a9e6 100644
>> --- a/tools/perf/util/cpumap.c
>> +++ b/tools/perf/util/cpumap.c
>> @@ -132,15 +132,16 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, 
>> struct perf_cpu_map **res,
>> int (*f)(struct perf_cpu_map *map, int cpu, void *data),
>> void *data)
>>  {
>> -struct perf_cpu_map *c;
>>  int nr = cpus->nr;
>> +struct perf_cpu_map *c = perf_cpu_map__empty_new(nr);
>>  int cpu, s1, s2;
>>  
>> -/* allocate as much as possible */
>> -c = calloc(1, sizeof(*c) + nr * sizeof(int));
>>  if (!c)
>>  return -1;
>>  
>> +/* Reset size as it may only be partially filled */
>> +c->nr = 0;
>> +
>>  for (cpu = 0; cpu < nr; cpu++) {
>>  s1 = f(cpus, cpu, data);
>>  for (s2 = 0; s2 < c->nr; s2++) {
> 
> also remove refcount_set call down here,
> it's already in set in perf_cpu_map__empty_new

Oops yeah, good catch. Removed in V5

James
> 
> thanks,
> jirka
> 
>> -- 
>> 2.28.0
>>
> 


Re: [PATCH 04/13 v4] perf tools: Replace aggregation ID with a struct

2020-11-17 Thread James Clark



On 15/11/2020 23:17, Jiri Olsa wrote:
> On Fri, Nov 13, 2020 at 07:26:45PM +0200, James Clark wrote:
> 
> SNIP
> 
>> @@ -754,7 +766,7 @@ static void print_aggr_thread(struct perf_stat_config 
>> *config,
>>  FILE *output = config->output;
>>  int nthreads = perf_thread_map__nr(counter->core.threads);
>>  int ncpus = perf_cpu_map__nr(counter->core.cpus);
>> -int thread, sorted_threads, id;
>> +int thread, sorted_threads;
>>  struct perf_aggr_thread_value *buf;
>>  
>>  buf = sort_aggr_thread(counter, nthreads, ncpus, &sorted_threads, 
>> _target);
>> @@ -767,13 +779,12 @@ static void print_aggr_thread(struct perf_stat_config 
>> *config,
>>  if (prefix)
>>  fprintf(output, "%s", prefix);
>>  
>> -id = buf[thread].id;
> 
> would it be less changes in here if you kept id with new type?

Yes it did turn out with almost no changes by just changing the type.

James

> 
> jirka
> 
>>  if (config->stats)
>> -printout(config, id, 0, buf[thread].counter, 
>> buf[thread].uval,
>> +printout(config, buf[thread].id, 0, 
>> buf[thread].counter, buf[thread].uval,
>>   prefix, buf[thread].run, buf[thread].ena, 1.0,
>> - &config->stats[id]);
>> + &config->stats[buf[thread].id.id]);
>>  else
>> -printout(config, id, 0, buf[thread].counter, 
>> buf[thread].uval,
>> +printout(config, buf[thread].id, 0, 
>> buf[thread].counter, buf[thread].uval,
>>   prefix, buf[thread].run, buf[thread].ena, 1.0,
>>   &rt_stat);
>>  fputc('\n', output);
> 
> SNIP
> 


[PATCH v5 10/12] perf tools: Add separate die member

2020-11-17 Thread James Clark
Add die as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 14 +++---
 tools/perf/tests/topology.c|  7 +--
 tools/perf/util/cpumap.c   | 28 ++--
 tools/perf/util/cpumap.h   |  6 +-
 tools/perf/util/stat-display.c |  6 +++---
 5 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 193e7a4e0c7b..514144dad8b1 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1389,11 +1389,7 @@ static struct aggr_cpu_id perf_env__get_die(struct 
perf_cpu_map *map, int idx, v
 * make a unique ID.
 */
id.socket = env->cpu[cpu].socket_id;
-
-   if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
-   id.id = env->cpu[cpu].die_id & 0xff;
+   id.die = env->cpu[cpu].die_id;
}
 
return id;
@@ -1407,20 +1403,16 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
 
if (cpu != -1) {
/*
-* encode die id in bit range 23:16
 * core_id is relative to socket and die,
 * we need a global id. So we combine
 * socket + die id + core id
 */
-   if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
id.socket = env->cpu[cpu].socket_id;
-   id.id = (env->cpu[cpu].die_id << 16) |
-  (env->cpu[cpu].core_id & 0x);
+   id.die = env->cpu[cpu].die_id;
+   id.id = env->cpu[cpu].core_id & 0x;
}
 
return id;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index f9c54be7767e..1b0db2405720 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -117,7 +117,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
 
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
-   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
}
 
@@ -128,8 +128,9 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
 
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
-   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
}
 
// Test that socket ID contains only socket
@@ -138,6 +139,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
}
 
@@ -148,6 +150,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
cpu__get_node(map->map[i]) == id.node);
TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
+   TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index d2630f03f682..10a52058d838 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -152,8 +152,10 @@ stati

[PATCH v5 04/12] perf tools: Replace aggregation ID with a struct

2020-11-17 Thread James Clark
Replace all occurrences of the usage of int with the new struct
cpu_aggr_id.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  76 +--
 tools/perf/tests/topology.c|  17 +++---
 tools/perf/util/cpumap.c   |  82 ++---
 tools/perf/util/cpumap.h   |  10 +--
 tools/perf/util/stat-display.c | 108 +++--
 tools/perf/util/stat.c |   2 +-
 tools/perf/util/stat.h |   5 +-
 7 files changed, 173 insertions(+), 127 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f15b2f8aa14d..f10c67a26472 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1188,65 +1188,67 @@ static struct option stat_options[] = {
OPT_END()
 };
 
-static int perf_stat__get_socket(struct perf_stat_config *config 
__maybe_unused,
+static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config 
*config __maybe_unused,
 struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_socket(map, cpu, NULL);
 }
 
-static int perf_stat__get_die(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config 
__maybe_unused,
  struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_die(map, cpu, NULL);
 }
 
-static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config 
__maybe_unused,
   struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_core(map, cpu, NULL);
 }
 
-static int perf_stat__get_node(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config 
__maybe_unused,
   struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_node(map, cpu, NULL);
 }
 
-static int perf_stat__get_aggr(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
   aggr_get_id_t get_id, struct perf_cpu_map *map, 
int idx)
 {
int cpu;
+   struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
if (idx >= map->nr)
-   return -1;
+   return id;
 
cpu = map->map[idx];
 
if (config->cpus_aggr_map->map[cpu] == -1)
-   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
+   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id;
 
-   return config->cpus_aggr_map->map[cpu];
+   id.id = config->cpus_aggr_map->map[cpu];
+   return id;
 }
 
-static int perf_stat__get_socket_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_socket_cached(struct perf_stat_config 
*config,
struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_socket, map, idx);
 }
 
-static int perf_stat__get_die_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config 
*config,
struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_die, map, idx);
 }
 
-static int perf_stat__get_core_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config 
*config,
  struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_core, map, idx);
 }
 
-static int perf_stat__get_node_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_node_cached(struct perf_stat_config 
*config,
  struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_node, map, idx);
@@ -1347,18 +1349,23 @@ static inline int perf_env__get_cpu(struct perf_env 
*env, struct perf_cpu_map *m
return cpu;
 }
 
-static int perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int 
idx, void *data)
 {
struct perf_env *env = data;
int cpu = perf_env__get_cpu(env, map, idx);
+   struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+
+   if (cpu != -1)
+   id.id = env->cpu[cpu].socket_id;
 
-   return cpu == -1 ? -1 : env->cpu[cpu].socket_id;
+   return id;
 }
 
-static int perf_env__get_die(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_e

[PATCH v5 07/12] perf tools: Start using cpu_aggr_id in map

2020-11-17 Thread James Clark
Use the new cpu_aggr_id struct in the cpu map
instead of int so that it can store more data.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 6 +++---
 tools/perf/util/cpumap.c   | 8 
 tools/perf/util/cpumap.h   | 2 +-
 tools/perf/util/stat-display.c | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 344e50651b55..afe9fa6112b6 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1223,10 +1223,10 @@ static struct aggr_cpu_id perf_stat__get_aggr(struct 
perf_stat_config *config,
 
cpu = map->map[idx];
 
-   if (config->cpus_aggr_map->map[cpu] == -1)
-   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id;
+   if (cpu_map__aggr_cpu_id_is_empty(config->cpus_aggr_map->map[cpu]))
+   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
 
-   id.id = config->cpus_aggr_map->map[cpu];
+   id = config->cpus_aggr_map->map[cpu];
return id;
 }
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index ea81586305f4..b50609b9a585 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -97,14 +97,14 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr)
 
 struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
 {
-   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr);
+   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(struct 
aggr_cpu_id) * nr);
 
if (cpus != NULL) {
int i;
 
cpus->nr = nr;
for (i = 0; i < nr; i++)
-   cpus->map[i] = -1;
+   cpus->map[i] = cpu_map__empty_aggr_cpu_id();
 
refcount_set(&cpus->refcnt, 1);
}
@@ -169,11 +169,11 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
cpu_aggr_map **res,
for (cpu = 0; cpu < nr; cpu++) {
s1 = f(cpus, cpu, data);
for (s2 = 0; s2 < c->nr; s2++) {
-   if (s1.id == c->map[s2])
+   if (cpu_map__compare_aggr_cpu_id(s1, c->map[s2]))
break;
}
if (s2 == c->nr) {
-   c->map[c->nr] = s1.id;
+   c->map[c->nr] = s1;
c->nr++;
}
}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index b112069038be..d8fc265bc762 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -14,7 +14,7 @@ struct aggr_cpu_id {
 struct cpu_aggr_map {
refcount_t refcnt;
int nr;
-   int map[];
+   struct aggr_cpu_id map[];
 };
 
 struct perf_record_cpu_map_data;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 1a3d66329d73..da0766403d3b 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -509,7 +509,7 @@ static void aggr_update_shadow(struct perf_stat_config 
*config,
struct evsel *counter;
 
for (s = 0; s < config->aggr_map->nr; s++) {
-   id.id = config->aggr_map->map[s];
+   id = config->aggr_map->map[s];
evlist__for_each_entry(evlist, counter) {
val = 0;
for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
@@ -641,7 +641,7 @@ static void print_counter_aggrdata(struct perf_stat_config 
*config,
struct aggr_cpu_id id;
double uval;
 
-   ad.id.id = id.id = config->aggr_map->map[s];
+   ad.id = id = config->aggr_map->map[s];
ad.val = ad.ena = ad.run = 0;
ad.nr = 0;
if (!collect_data(config, counter, aggr_cb, &ad))
@@ -1169,7 +1169,7 @@ static void print_percore_thread(struct perf_stat_config 
*config,
for (int i = 0; i < evsel__nr_cpus(counter); i++) {
s2 = config->aggr_get_id(config, evsel__cpus(counter), i);
for (s = 0; s < config->aggr_map->nr; s++) {
-   id.id = config->aggr_map->map[s];
+   id = config->aggr_map->map[s];
if (cpu_map__compare_aggr_cpu_id(s2, id))
break;
}
-- 
2.28.0



[PATCH v5 11/12] perf tools: Add separate core member

2020-11-17 Thread James Clark
Add core as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  9 +++--
 tools/perf/tests/topology.c|  6 +-
 tools/perf/util/cpumap.c   | 18 ++
 tools/perf/util/cpumap.h   |  6 +-
 tools/perf/util/stat-display.c | 16 
 5 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 514144dad8b1..d79a29e22dfd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1404,15 +1404,12 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
if (cpu != -1) {
/*
 * core_id is relative to socket and die,
-* we need a global id. So we combine
-* socket + die id + core id
+* we need a global id. So we set
+* socket, die id and core id
 */
-   if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
id.socket = env->cpu[cpu].socket_id;
id.die = env->cpu[cpu].die_id;
-   id.id = env->cpu[cpu].core_id & 0x;
+   id.core = env->cpu[cpu].core_id;
}
 
return id;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 1b0db2405720..3baaac6c7454 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -111,7 +111,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
for (i = 0; i < map->nr; i++) {
id = cpu_map__get_core(map, i, NULL);
TEST_ASSERT_VAL("Core map - Core ID doesn't match",
-   session->header.env.cpu[map->map[i]].core_id == 
cpu_map__id_to_cpu(id.id));
+   session->header.env.cpu[map->map[i]].core_id == 
id.core);
 
TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
@@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Core map - ID is set", id.id == -1);
}
 
// Test that die ID contains socket and die
@@ -131,6 +132,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Die map - Core is set", id.core == -1);
}
 
// Test that socket ID contains only socket
@@ -141,6 +143,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1);
}
 
// Test that node ID contains only node
@@ -151,6 +154,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
+   TEST_ASSERT_VAL("Node map - Core is set", id.core == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 10a52058d838..d164f7bd1ac7 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -154,8 +154,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
return a->node - b->node;
else if (a->socket != b->socket)
return a->socket - b->socket;
-   else
+   else if (a->die != b->die)
return a->die - b->die;
+   else
+   return a->core - b->core;
 }
 
 int cpu_map__build_ma

[PATCH v5 09/12] perf tools: Add separate socket member

2020-11-17 Thread James Clark
Add socket as a separate member so that it doesn't have to be
packed into the int value. When the socket ID was larger than
8 bits the output appeared corrupted or incomplete.

For example, here on ThunderX2 perf stat reports a socket
of -1 and an invalid die number:

  ./perf stat -a --per-die
  The socket id number is too big.

  Performance counter stats for 'system wide':

  S-1-D255   128 687.99 msec cpu-clock #   
57.240 CPUs utilized
  ...
  S36-D0 128 842.34 msec cpu-clock #   
70.081 CPUs utilized
  ...

And with --per-core there is an entry with an invalid core ID:

  ./perf stat record -a --per-core
  The socket id number is too big.

  Performance counter stats for 'system wide':
  S-1-D255-C65535 128 671.04 msec cpu-clock #   
54.112 CPUs utilized
  ...
  S36-D0-C0   4  28.27 msec cpu-clock #
2.279 CPUs utilized
  ...

This fixes the "Session topology" self test on ThunderX2.

After this fix the output contains the correct socket and die
IDs and no longer prints a warning about the size of the
socket ID:

  ./perf stat --per-die -a

  Performance counter stats for 'system wide':

  S36-D0 128 169,869.39 msec cpu-clock #  
127.501 CPUs utilized
  ...
  S3612-D0 128 169,733.05 msec cpu-clock     #  
127.398 CPUs utilized

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 22 +++--
 tools/perf/tests/topology.c| 11 -
 tools/perf/util/cpumap.c   | 44 +-
 tools/perf/util/cpumap.h   |  6 +
 tools/perf/util/stat-display.c |  8 +++
 tools/perf/util/stat.c |  2 +-
 6 files changed, 41 insertions(+), 52 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2db2550eef9e..193e7a4e0c7b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1371,7 +1371,7 @@ static struct aggr_cpu_id perf_env__get_socket(struct 
perf_cpu_map *map, int idx
struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
if (cpu != -1)
-   id.id = env->cpu[cpu].socket_id;
+   id.socket = env->cpu[cpu].socket_id;
 
return id;
 }
@@ -1384,18 +1384,16 @@ static struct aggr_cpu_id perf_env__get_die(struct 
perf_cpu_map *map, int idx, v
 
if (cpu != -1) {
/*
-* Encode socket in bit range 15:8
-* die_id is relative to socket,
-* we need a global id. So we combine
-* socket + die id
+* die_id is relative to socket, so start
+* with the socket ID and then add die to
+* make a unique ID.
 */
-   if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id 
number is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
+   id.socket = env->cpu[cpu].socket_id;
 
if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
-   id.id = (env->cpu[cpu].socket_id << 8) | (env->cpu[cpu].die_id 
& 0xff);
+   id.id = env->cpu[cpu].die_id & 0xff;
}
 
return id;
@@ -1409,23 +1407,19 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
 
if (cpu != -1) {
/*
-* Encode socket in bit range 31:24
 * encode die id in bit range 23:16
 * core_id is relative to socket and die,
 * we need a global id. So we combine
 * socket + die id + core id
 */
-   if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id 
number is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
-   id.id = (env->cpu[cpu].socket_id << 24) |
-  (env->cpu[cpu].die_id << 16) |
+   id.socket = env->cpu[cpu].socket_id;
+   id.id = (env->cpu[cpu].die_id << 16) |
   (env->cpu[cpu].core_id & 0x);
}
 
diff --git a/tools/

[PATCH v5 03/12] perf tools: Add new struct for cpu aggregation

2020-11-17 Thread James Clark
This struct currently has only a single int member so that
it can be used as a drop in replacement for the existing
behaviour.

Comparison and constructor functions have also been added
that will replace usages of '==' and '= -1'.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/util/cpumap.c | 18 ++
 tools/perf/util/cpumap.h |  8 
 2 files changed, 26 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 20e3a75953fc..8624948b4f1d 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -586,3 +586,21 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread 
unsafe */
 
return online;
 }
+
+bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
+{
+   return a.id == b.id;
+}
+
+bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
+{
+   return a.id == -1;
+}
+
+struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
+{
+   struct aggr_cpu_id ret = {
+   .id = -1
+   };
+   return ret;
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 3a442f021468..1cdccc69cd4b 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -7,6 +7,10 @@
 #include 
 #include 
 
+struct aggr_cpu_id {
+   int id;
+};
+
 struct perf_record_cpu_map_data;
 
 struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
@@ -64,4 +68,8 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
perf_cpu_map **res,
 int cpu_map__cpu(struct perf_cpu_map *cpus, int idx);
 bool cpu_map__has(struct perf_cpu_map *cpus, int cpu);
 
+bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b);
+bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a);
+struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void);
+
 #endif /* __PERF_CPUMAP_H */
-- 
2.28.0



[PATCH v5 02/12] perf tools: Use allocator for perf_cpu_map

2020-11-17 Thread James Clark
Use the existing allocator for perf_cpu_map to avoid use
of raw malloc. This could cause an issue in later commits
where the size of perf_cpu_map is changed.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/util/cpumap.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index dc5c5e6fc502..20e3a75953fc 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -132,15 +132,16 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
perf_cpu_map **res,
   int (*f)(struct perf_cpu_map *map, int cpu, void *data),
   void *data)
 {
-   struct perf_cpu_map *c;
int nr = cpus->nr;
+   struct perf_cpu_map *c = perf_cpu_map__empty_new(nr);
int cpu, s1, s2;
 
-   /* allocate as much as possible */
-   c = calloc(1, sizeof(*c) + nr * sizeof(int));
if (!c)
return -1;
 
+   /* Reset size as it may only be partially filled */
+   c->nr = 0;
+
for (cpu = 0; cpu < nr; cpu++) {
s1 = f(cpus, cpu, data);
for (s2 = 0; s2 < c->nr; s2++) {
@@ -155,7 +156,6 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
perf_cpu_map **res,
/* ensure we process id in increasing order */
qsort(c->map, c->nr, sizeof(int), cmp_ids);
 
-   refcount_set(&c->refcnt, 1);
*res = c;
return 0;
 }
-- 
2.28.0



[PATCH v5 08/12] perf tools: Add separate node member

2020-11-17 Thread James Clark
Add node as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  2 +-
 tools/perf/tests/topology.c|  6 +-
 tools/perf/util/cpumap.c   | 16 +++-
 tools/perf/util/cpumap.h   |  1 +
 tools/perf/util/stat-display.c |  2 +-
 5 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index afe9fa6112b6..2db2550eef9e 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1437,7 +1437,7 @@ static struct aggr_cpu_id perf_env__get_node(struct 
perf_cpu_map *map, int idx,
int cpu = perf_env__get_cpu(data, map, idx);
struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
-   id.id = perf_env__numa_node(data, cpu);
+   id.node = perf_env__numa_node(data, cpu);
return id;
 }
 
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index aeca2510dea8..f0c0fc6e243d 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
}
 
// Test that die ID contains socket and die
@@ -130,6 +131,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
}
 
// Test that socket ID contains only socket
@@ -138,13 +140,15 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id ==
cpu_map__id_to_socket(id.id));
+   TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
}
 
// Test that node ID contains only node
for (i = 0; i < map->nr; i++) {
id = cpu_map__get_node(map, i, NULL);
TEST_ASSERT_VAL("Node map - Node ID doesn't match",
-   cpu__get_node(map->map[i]) == id.id);
+   cpu__get_node(map->map[i]) == id.node);
+   TEST_ASSERT_VAL("Node map - ID is set", id.id == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index b50609b9a585..5f9e98ddbe34 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -148,7 +148,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer;
 
-   return a->id - b->id;
+   if (a->id != b->id)
+   return a->id - b->id;
+   else
+   return a->node - b->node;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -275,7 +278,7 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map 
*map, int idx, void *da
if (idx < 0 || idx >= map->nr)
return id;
 
-   id.id = cpu_map__get_node_id(map->map[idx]);
+   id.node = cpu_map__get_node_id(map->map[idx]);
return id;
 }
 
@@ -620,18 +623,21 @@ const struct perf_cpu_map *cpu_map__online(void) /* 
thread unsafe */
 
 bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
 {
-   return a.id == b.id;
+   return a.id == b.id &&
+   a.node == b.node;
 }
 
 bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
 {
-   return a.id == -1;
+   return a.id == -1 &&
+   a.node == -1;
 }
 
 struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
 {
struct aggr_cpu_id ret = {
-   .id = -1
+   .id = -1,
+   .node = -1
};
return ret;
 }
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index d8fc265bc762..f79e92603024 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -9,6 +9,7 @@
 
 struct aggr_cpu_id {
int id;
+   int node;
 };
 
 struct cpu_aggr_map {
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-dis

[PATCH v5 01/12] perf tools: Improve topology test

2020-11-17 Thread James Clark
Improve the topology test to check all aggregation
types. This is to lock down the behaviour before
'id' is changed into a struct in later commits.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/tests/topology.c | 53 -
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 22daf2bdf5fa..7bd8848d36b6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -64,10 +64,11 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
.path = path,
.mode = PERF_DATA_MODE_READ,
};
-   int i;
+   int i, id;
 
session = perf_session__new(&data, false, NULL);
TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
+   cpu__setup_cpunode_map();
 
/* On platforms with large numbers of CPUs process_cpu_topology()
 * might issue an error while reading the perf.data file section
@@ -85,11 +86,18 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 *  "socket_id number is too big. You may need to upgrade the
 *  perf tool."
 *
-*  This is the reason why this test might be skipped.
+*  This is the reason why this test might be skipped. aarch64 and
+*  s390 always write this part of the header, even when the above
+*  condition is true (see do_core_id_test in header.c). So always
+*  run this test on those platforms.
 */
-   if (!session->header.env.cpu)
+   if (!session->header.env.cpu
+   && strncmp(session->header.env.arch, "s390", 4)
+   && strncmp(session->header.env.arch, "aarch64", 7))
return TEST_SKIP;
 
+   TEST_ASSERT_VAL("Session header CPU map not set", 
session->header.env.cpu);
+
for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
if (!cpu_map__has(map, i))
continue;
@@ -98,14 +106,45 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 session->header.env.cpu[i].socket_id);
}
 
+   // Test that core ID contains socket, die and core
+   for (i = 0; i < map->nr; i++) {
+   id = cpu_map__get_core(map, i, NULL);
+   TEST_ASSERT_VAL("Core map - Core ID doesn't match",
+   session->header.env.cpu[map->map[i]].core_id == 
cpu_map__id_to_cpu(id));
+
+   TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
+   session->header.env.cpu[map->map[i]].socket_id ==
+   cpu_map__id_to_socket(id));
+
+   TEST_ASSERT_VAL("Core map - Die ID doesn't match",
+   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id));
+   }
+
+   // Test that die ID contains socket and die
for (i = 0; i < map->nr; i++) {
-   TEST_ASSERT_VAL("Core ID doesn't match",
-   (session->header.env.cpu[map->map[i]].core_id == 
(cpu_map__get_core(map, i, NULL) & 0x)));
+   id = cpu_map__get_die(map, i, NULL);
+   TEST_ASSERT_VAL("Die map - Socket ID doesn't match",
+   session->header.env.cpu[map->map[i]].socket_id ==
+   cpu_map__id_to_socket(id));
 
-   TEST_ASSERT_VAL("Socket ID doesn't match",
-   (session->header.env.cpu[map->map[i]].socket_id == 
cpu_map__get_socket(map, i, NULL)));
+   TEST_ASSERT_VAL("Die map - Die ID doesn't match",
+   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id));
}
 
+   // Test that socket ID contains only socket
+   for (i = 0; i < map->nr; i++) {
+   id = cpu_map__get_socket(map, i, NULL);
+   TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
+   session->header.env.cpu[map->map[i]].socket_id ==
+   cpu_map__id_to_socket(id));
+   }
+
+   // Test that node ID contains only node
+   for (i = 0; i < map->nr; i++) {
+   id = cpu_map__get_node(map, i, NULL);
+   TEST_ASSERT_VAL("Node map - Node ID doesn't match",
+   cpu__get_node(map->map[i]) == id);
+   }
perf_session__delete(session);
 
return 0;
-- 
2.28.0



[PATCH v5 06/12] perf tools: drop in cpu_aggr_map struct

2020-11-17 Thread James Clark
Replace usages of perf_cpu_map with cpu_aggr_map in
places that are involved with perf stat aggregation.

This will then later be changed to be a map of
cpu_aggr_id rather than an int so that more data can
be stored.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c | 29 ++---
 tools/perf/util/cpumap.c  | 12 ++--
 tools/perf/util/cpumap.h  | 10 +-
 tools/perf/util/stat.h|  4 ++--
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f10c67a26472..344e50651b55 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1322,14 +1322,29 @@ static int perf_stat_init_aggr_mode(void)
 * the aggregation translate cpumap.
 */
nr = perf_cpu_map__max(evsel_list->core.cpus);
-   stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1);
+   stat_config.cpus_aggr_map = cpu_aggr_map__empty_new(nr + 1);
return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
 
+static void cpu_aggr_map__delete(struct cpu_aggr_map *map)
+{
+   if (map) {
+   WARN_ONCE(refcount_read(&map->refcnt) != 0,
+ "cpu_aggr_map refcnt unbalanced\n");
+   free(map);
+   }
+}
+
+static void cpu_aggr_map__put(struct cpu_aggr_map *map)
+{
+   if (map && refcount_dec_and_test(&map->refcnt))
+   cpu_aggr_map__delete(map);
+}
+
 static void perf_stat__exit_aggr_mode(void)
 {
-   perf_cpu_map__put(stat_config.aggr_map);
-   perf_cpu_map__put(stat_config.cpus_aggr_map);
+   cpu_aggr_map__put(stat_config.aggr_map);
+   cpu_aggr_map__put(stat_config.cpus_aggr_map);
stat_config.aggr_map = NULL;
stat_config.cpus_aggr_map = NULL;
 }
@@ -1427,25 +1442,25 @@ static struct aggr_cpu_id perf_env__get_node(struct 
perf_cpu_map *map, int idx,
 }
 
 static int perf_env__build_socket_map(struct perf_env *env, struct 
perf_cpu_map *cpus,
- struct perf_cpu_map **sockp)
+ struct cpu_aggr_map **sockp)
 {
return cpu_map__build_map(cpus, sockp, perf_env__get_socket, env);
 }
 
 static int perf_env__build_die_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-  struct perf_cpu_map **diep)
+  struct cpu_aggr_map **diep)
 {
return cpu_map__build_map(cpus, diep, perf_env__get_die, env);
 }
 
 static int perf_env__build_core_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-   struct perf_cpu_map **corep)
+   struct cpu_aggr_map **corep)
 {
return cpu_map__build_map(cpus, corep, perf_env__get_core, env);
 }
 
 static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-   struct perf_cpu_map **nodep)
+   struct cpu_aggr_map **nodep)
 {
return cpu_map__build_map(cpus, nodep, perf_env__get_node, env);
 }
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index b18e53506656..ea81586305f4 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -151,12 +151,12 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
return a->id - b->id;
 }
 
-int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res,
+int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
   struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int 
cpu, void *data),
   void *data)
 {
int nr = cpus->nr;
-   struct perf_cpu_map *c = perf_cpu_map__empty_new(nr);
+   struct cpu_aggr_map *c = cpu_aggr_map__empty_new(nr);
int cpu, s2;
struct aggr_cpu_id s1;
 
@@ -279,22 +279,22 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map 
*map, int idx, void *da
return id;
 }
 
-int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**sockp)
+int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**sockp)
 {
return cpu_map__build_map(cpus, sockp, cpu_map__get_socket, NULL);
 }
 
-int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**diep)
+int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**diep)
 {
return cpu_map__build_map(cpus, diep, cpu_map__get_die, NULL);
 }
 
-int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**corep)
+int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**corep)
 {
return cpu_map__build_map(cpus, corep, cpu_map__get_co

[PATCH 06/13 v4] perf tools: drop in cpu_aggr_map struct

2020-11-13 Thread James Clark
Replace usages of perf_cpu_map with cpu_aggr_map in
places that are involved with perf stat aggregation.

This will then later be changed to be a map of
cpu_aggr_id rather than an int so that more data can
be stored.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c | 14 +++---
 tools/perf/util/cpumap.c  | 12 ++--
 tools/perf/util/cpumap.h  | 10 +-
 tools/perf/util/stat.h|  4 ++--
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f10c67a26472..7daac139f6cc 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1322,14 +1322,14 @@ static int perf_stat_init_aggr_mode(void)
 * the aggregation translate cpumap.
 */
nr = perf_cpu_map__max(evsel_list->core.cpus);
-   stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1);
+   stat_config.cpus_aggr_map = cpu_aggr_map__empty_new(nr + 1);
return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
 
 static void perf_stat__exit_aggr_mode(void)
 {
-   perf_cpu_map__put(stat_config.aggr_map);
-   perf_cpu_map__put(stat_config.cpus_aggr_map);
+   cpu_aggr_map__put(stat_config.aggr_map);
+   cpu_aggr_map__put(stat_config.cpus_aggr_map);
stat_config.aggr_map = NULL;
stat_config.cpus_aggr_map = NULL;
 }
@@ -1427,25 +1427,25 @@ static struct aggr_cpu_id perf_env__get_node(struct 
perf_cpu_map *map, int idx,
 }
 
 static int perf_env__build_socket_map(struct perf_env *env, struct 
perf_cpu_map *cpus,
- struct perf_cpu_map **sockp)
+ struct cpu_aggr_map **sockp)
 {
return cpu_map__build_map(cpus, sockp, perf_env__get_socket, env);
 }
 
 static int perf_env__build_die_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-  struct perf_cpu_map **diep)
+  struct cpu_aggr_map **diep)
 {
return cpu_map__build_map(cpus, diep, perf_env__get_die, env);
 }
 
 static int perf_env__build_core_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-   struct perf_cpu_map **corep)
+   struct cpu_aggr_map **corep)
 {
return cpu_map__build_map(cpus, corep, perf_env__get_core, env);
 }
 
 static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map 
*cpus,
-   struct perf_cpu_map **nodep)
+   struct cpu_aggr_map **nodep)
 {
return cpu_map__build_map(cpus, nodep, perf_env__get_node, env);
 }
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 7500225c8571..e831a18ec95e 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -166,12 +166,12 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
return a->id - b->id;
 }
 
-int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res,
+int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
   struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int 
cpu, void *data),
   void *data)
 {
int nr = cpus->nr;
-   struct perf_cpu_map *c = perf_cpu_map__empty_new(nr);
+   struct cpu_aggr_map *c = cpu_aggr_map__empty_new(nr);
int cpu, s2;
struct aggr_cpu_id s1;
 
@@ -295,22 +295,22 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map 
*map, int idx, void *da
return id;
 }
 
-int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**sockp)
+int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**sockp)
 {
return cpu_map__build_map(cpus, sockp, cpu_map__get_socket, NULL);
 }
 
-int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**diep)
+int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**diep)
 {
return cpu_map__build_map(cpus, diep, cpu_map__get_die, NULL);
 }
 
-int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**corep)
+int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**corep)
 {
return cpu_map__build_map(cpus, corep, cpu_map__get_core, NULL);
 }
 
-int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct perf_cpu_map 
**numap)
+int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct cpu_aggr_map 
**numap)
 {
return cpu_map__build_map(cpus, numap, cpu_map__get_node, NULL);
 }
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index ab6be8ef696f..d82822ddcbce 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -36,10 +

[PATCH 00/13 v4] perf tools: fix perf stat with large socket IDs

2020-11-13 Thread James Clark
v3 had a mistake in a couple of my signed off lines so I have fixed them
in v4.

v3 breaks up the previous v2 patchset into smaller atomic commits.
The end result is the same as the previous patchset apart from
some minor refactoring, asserting on an empty header and
calling cpu__setup_cpunode_map() in the topology self test.

Testing done:

Tested --per-core, --per-thread, --per-die, --per-node 'perf
stat' outputs on Arm ThunderX2 and Intel KNL.

Also tested 'perf stat record' and 'perf stat report --input'
with recordings from a version of perf before this patchset
to confirm that the output was the same.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 

James Clark (13):
  perf tools: Improve topology test
  perf tools: Use allocator for perf_cpu_map
  perf tools: Add new struct for cpu aggregation
  perf tools: Replace aggregation ID with a struct
  perf tools: add new map type for aggregation
  perf tools: drop in cpu_aggr_map struct
  perf tools: restrict visibility of functions
  perf tools: Start using cpu_aggr_id in map
  perf tools: Add separate node member
  perf tools: Add separate socket member
  perf tools: Add separate die member
  perf tools: Add separate core member
  perf tools: add thread field

 tools/perf/builtin-stat.c  | 128 +
 tools/perf/tests/topology.c|  58 +--
 tools/perf/util/cpumap.c   | 170 ++---
 tools/perf/util/cpumap.h   |  55 ++-
 tools/perf/util/stat-display.c | 106 +++-
 tools/perf/util/stat.c |   2 +-
 tools/perf/util/stat.h |   9 +-
 7 files changed, 332 insertions(+), 196 deletions(-)

-- 
2.28.0



[PATCH 08/13 v4] perf tools: Start using cpu_aggr_id in map

2020-11-13 Thread James Clark
Use the new cpu_aggr_id struct in the cpu map
instead of int so that it can store more data.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 6 +++---
 tools/perf/util/cpumap.c   | 8 
 tools/perf/util/cpumap.h   | 2 +-
 tools/perf/util/stat-display.c | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 344e50651b55..afe9fa6112b6 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1223,10 +1223,10 @@ static struct aggr_cpu_id perf_stat__get_aggr(struct 
perf_stat_config *config,
 
cpu = map->map[idx];
 
-   if (config->cpus_aggr_map->map[cpu] == -1)
-   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id;
+   if (cpu_map__aggr_cpu_id_is_empty(config->cpus_aggr_map->map[cpu]))
+   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
 
-   id.id = config->cpus_aggr_map->map[cpu];
+   id = config->cpus_aggr_map->map[cpu];
return id;
 }
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index e90270f0be57..0f42e6a6b704 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -97,14 +97,14 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr)
 
 struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
 {
-   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr);
+   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(struct 
aggr_cpu_id) * nr);
 
if (cpus != NULL) {
int i;
 
cpus->nr = nr;
for (i = 0; i < nr; i++)
-   cpus->map[i] = -1;
+   cpus->map[i] = cpu_map__empty_aggr_cpu_id();
 
refcount_set(&cpus->refcnt, 1);
}
@@ -169,11 +169,11 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct 
cpu_aggr_map **res,
for (cpu = 0; cpu < nr; cpu++) {
s1 = f(cpus, cpu, data);
for (s2 = 0; s2 < c->nr; s2++) {
-   if (s1.id == c->map[s2])
+   if (cpu_map__compare_aggr_cpu_id(s1, c->map[s2]))
break;
}
if (s2 == c->nr) {
-   c->map[c->nr] = s1.id;
+   c->map[c->nr] = s1;
c->nr++;
}
}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index b112069038be..d8fc265bc762 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -14,7 +14,7 @@ struct aggr_cpu_id {
 struct cpu_aggr_map {
refcount_t refcnt;
int nr;
-   int map[];
+   struct aggr_cpu_id map[];
 };
 
 struct perf_record_cpu_map_data;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 01acb7d5e120..ad91e8a7d5af 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -509,7 +509,7 @@ static void aggr_update_shadow(struct perf_stat_config 
*config,
struct evsel *counter;
 
for (s = 0; s < config->aggr_map->nr; s++) {
-   id.id = config->aggr_map->map[s];
+   id = config->aggr_map->map[s];
evlist__for_each_entry(evlist, counter) {
val = 0;
for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
@@ -641,7 +641,7 @@ static void print_counter_aggrdata(struct perf_stat_config 
*config,
struct aggr_cpu_id id;
double uval;
 
-   ad.id.id = id.id = config->aggr_map->map[s];
+   ad.id = id = config->aggr_map->map[s];
ad.val = ad.ena = ad.run = 0;
ad.nr = 0;
if (!collect_data(config, counter, aggr_cb, &ad))
@@ -1167,7 +1167,7 @@ static void print_percore_thread(struct perf_stat_config 
*config,
for (int i = 0; i < evsel__nr_cpus(counter); i++) {
s2 = config->aggr_get_id(config, evsel__cpus(counter), i);
for (s = 0; s < config->aggr_map->nr; s++) {
-   id.id = config->aggr_map->map[s];
+   id = config->aggr_map->map[s];
if (cpu_map__compare_aggr_cpu_id(s2, id))
break;
}
-- 
2.28.0



[PATCH 04/13 v4] perf tools: Replace aggregation ID with a struct

2020-11-13 Thread James Clark
Replace all occurrences of the usage of int with the new struct
cpu_aggr_id.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  76 --
 tools/perf/tests/topology.c|  17 ++---
 tools/perf/util/cpumap.c   |  82 ++--
 tools/perf/util/cpumap.h   |  10 +--
 tools/perf/util/stat-display.c | 112 +++--
 tools/perf/util/stat.c |   2 +-
 tools/perf/util/stat.h |   5 +-
 7 files changed, 174 insertions(+), 130 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f15b2f8aa14d..f10c67a26472 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1188,65 +1188,67 @@ static struct option stat_options[] = {
OPT_END()
 };
 
-static int perf_stat__get_socket(struct perf_stat_config *config 
__maybe_unused,
+static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config 
*config __maybe_unused,
 struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_socket(map, cpu, NULL);
 }
 
-static int perf_stat__get_die(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config 
__maybe_unused,
  struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_die(map, cpu, NULL);
 }
 
-static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config 
__maybe_unused,
   struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_core(map, cpu, NULL);
 }
 
-static int perf_stat__get_node(struct perf_stat_config *config __maybe_unused,
+static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config 
__maybe_unused,
   struct perf_cpu_map *map, int cpu)
 {
return cpu_map__get_node(map, cpu, NULL);
 }
 
-static int perf_stat__get_aggr(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
   aggr_get_id_t get_id, struct perf_cpu_map *map, 
int idx)
 {
int cpu;
+   struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
if (idx >= map->nr)
-   return -1;
+   return id;
 
cpu = map->map[idx];
 
if (config->cpus_aggr_map->map[cpu] == -1)
-   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
+   config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id;
 
-   return config->cpus_aggr_map->map[cpu];
+   id.id = config->cpus_aggr_map->map[cpu];
+   return id;
 }
 
-static int perf_stat__get_socket_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_socket_cached(struct perf_stat_config 
*config,
struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_socket, map, idx);
 }
 
-static int perf_stat__get_die_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config 
*config,
struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_die, map, idx);
 }
 
-static int perf_stat__get_core_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config 
*config,
  struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_core, map, idx);
 }
 
-static int perf_stat__get_node_cached(struct perf_stat_config *config,
+static struct aggr_cpu_id perf_stat__get_node_cached(struct perf_stat_config 
*config,
  struct perf_cpu_map *map, int idx)
 {
return perf_stat__get_aggr(config, perf_stat__get_node, map, idx);
@@ -1347,18 +1349,23 @@ static inline int perf_env__get_cpu(struct perf_env 
*env, struct perf_cpu_map *m
return cpu;
 }
 
-static int perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int 
idx, void *data)
 {
struct perf_env *env = data;
int cpu = perf_env__get_cpu(env, map, idx);
+   struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+
+   if (cpu != -1)
+   id.id = env->cpu[cpu].socket_id;
 
-   return cpu == -1 ? -1 : env->cpu[cpu].socket_id;
+   return id;
 }
 
-static int perf_env__get_die(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_e

[PATCH 09/13 v4] perf tools: Add separate node member

2020-11-13 Thread James Clark
Add node as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  2 +-
 tools/perf/tests/topology.c|  6 +-
 tools/perf/util/cpumap.c   | 16 +++-
 tools/perf/util/cpumap.h   |  1 +
 tools/perf/util/stat-display.c |  2 +-
 5 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index afe9fa6112b6..2db2550eef9e 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1437,7 +1437,7 @@ static struct aggr_cpu_id perf_env__get_node(struct 
perf_cpu_map *map, int idx,
int cpu = perf_env__get_cpu(data, map, idx);
struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
-   id.id = perf_env__numa_node(data, cpu);
+   id.node = perf_env__numa_node(data, cpu);
return id;
 }
 
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index aeca2510dea8..90d9c259d258 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
}
 
// Test that die ID contains socket and die
@@ -130,6 +131,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
 
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
}
 
// Test that socket ID contains only socket
@@ -138,13 +140,15 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id ==
cpu_map__id_to_socket(id.id));
+   TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
}
 
// Test that node ID contains only node
for (i = 0; i < map->nr; i++) {
id = cpu_map__get_node(map, i, NULL);
TEST_ASSERT_VAL("Node map - Node ID doesn't match",
-   cpu__get_node(map->map[i]) == id.id);
+   cpu__get_node(map->map[i]) == id.node);
+   TEST_ASSERT_VAL("Node map - ID shouldn't be set", id.id == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 0f42e6a6b704..9929ee5cf177 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -148,7 +148,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer;
 
-   return a->id - b->id;
+   if (a->id != b->id)
+   return a->id - b->id;
+   else
+   return a->node - b->node;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -276,7 +279,7 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map 
*map, int idx, void *da
if (idx < 0 || idx >= map->nr)
return id;
 
-   id.id = cpu_map__get_node_id(map->map[idx]);
+   id.node = cpu_map__get_node_id(map->map[idx]);
return id;
 }
 
@@ -621,18 +624,21 @@ const struct perf_cpu_map *cpu_map__online(void) /* 
thread unsafe */
 
 bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
 {
-   return a.id == b.id;
+   return a.id == b.id &&
+   a.node == b.node;
 }
 
 bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
 {
-   return a.id == -1;
+   return a.id == -1 &&
+   a.node == -1;
 }
 
 struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
 {
struct aggr_cpu_id ret = {
-   .id = -1
+   .id = -1,
+   .node = -1
};
return ret;
 }
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index d8fc265bc762..f79e92603024 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -9,6 +9,7 @@
 
 struct aggr_cpu_id {
int id;
+   int node;
 };
 
 struct cpu_aggr_map {
diff --git a/tools/perf/util/stat-display.c b/to

[PATCH 12/13 v4] perf tools: Add separate core member

2020-11-13 Thread James Clark
Add core as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  |  9 +++--
 tools/perf/tests/topology.c|  3 ++-
 tools/perf/util/cpumap.c   | 18 ++
 tools/perf/util/cpumap.h   |  6 +-
 tools/perf/util/stat-display.c | 16 
 5 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 514144dad8b1..d79a29e22dfd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1404,15 +1404,12 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
if (cpu != -1) {
/*
 * core_id is relative to socket and die,
-* we need a global id. So we combine
-* socket + die id + core id
+* we need a global id. So we set
+* socket, die id and core id
 */
-   if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
id.socket = env->cpu[cpu].socket_id;
id.die = env->cpu[cpu].die_id;
-   id.id = env->cpu[cpu].core_id & 0x;
+   id.core = env->cpu[cpu].core_id;
}
 
return id;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 1256cf63d4d0..694f786a77f3 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -111,7 +111,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
for (i = 0; i < map->nr; i++) {
id = cpu_map__get_core(map, i, NULL);
TEST_ASSERT_VAL("Core map - Core ID doesn't match",
-   session->header.env.cpu[map->map[i]].core_id == 
cpu_map__id_to_cpu(id.id));
+   session->header.env.cpu[map->map[i]].core_id == 
id.core);
 
TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
@@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Core map - ID is set", id.id == -1);
}
 
// Test that die ID contains socket and die
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index c1d3539222f2..d988dfc4e6b6 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -154,8 +154,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
return a->node - b->node;
else if (a->socket != b->socket)
return a->socket - b->socket;
-   else
+   else if (a->die != b->die)
return a->die - b->die;
+   else
+   return a->core - b->core;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -259,10 +261,7 @@ struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map 
*map, int idx, void *da
 * core_id is relative to socket and die, we need a global id.
 * So we combine the result from cpu_map__get_die with the core id
 */
-   if (WARN_ONCE(cpu >> 16, "The core id number is too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
-   id.id = (cpu & 0x);
+   id.core = cpu;
return id;
 }
 
@@ -621,7 +620,8 @@ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, 
struct aggr_cpu_id b)
return a.id == b.id &&
a.node == b.node &&
a.socket == b.socket &&
-   a.die == b.die;
+   a.die == b.die &&
+   a.core == b.core;
 }
 
 bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
@@ -629,7 +629,8 @@ bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
return a.id == -1 &&
a.node == -1 &&
a.socket == -1 &&
-   a.die == -1;
+   a.die == -1 &&
+   a.core == -1;
 }
 
 struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
@@ -638,7 +639,8 @@ struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
.id = -1,
.node = -1,
.socket = -1,
-  

[PATCH 07/13 v4] perf tools: restrict visibility of functions

2020-11-13 Thread James Clark
These cpu_aggr_map refcounting functions are only used in
builtin-stat.c so their visibilty can be reduced to just
that file.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c | 15 +++
 tools/perf/util/cpumap.c  | 15 ---
 tools/perf/util/cpumap.h  |  2 --
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7daac139f6cc..344e50651b55 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1326,6 +1326,21 @@ static int perf_stat_init_aggr_mode(void)
return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
 
+static void cpu_aggr_map__delete(struct cpu_aggr_map *map)
+{
+   if (map) {
+   WARN_ONCE(refcount_read(&map->refcnt) != 0,
+ "cpu_aggr_map refcnt unbalanced\n");
+   free(map);
+   }
+}
+
+static void cpu_aggr_map__put(struct cpu_aggr_map *map)
+{
+   if (map && refcount_dec_and_test(&map->refcnt))
+   cpu_aggr_map__delete(map);
+}
+
 static void perf_stat__exit_aggr_mode(void)
 {
cpu_aggr_map__put(stat_config.aggr_map);
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index e831a18ec95e..e90270f0be57 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -112,21 +112,6 @@ struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
return cpus;
 }
 
-void cpu_aggr_map__delete(struct cpu_aggr_map *map)
-{
-   if (map) {
-   WARN_ONCE(refcount_read(&map->refcnt) != 0,
- "cpu_aggr_map refcnt unbalanced\n");
-   free(map);
-   }
-}
-
-void cpu_aggr_map__put(struct cpu_aggr_map *map)
-{
-   if (map && refcount_dec_and_test(&map->refcnt))
-   cpu_aggr_map__delete(map);
-}
-
 static int cpu__get_topology_int(int cpu, const char *name, int *value)
 {
char path[PATH_MAX];
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index d82822ddcbce..b112069038be 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -21,8 +21,6 @@ struct perf_record_cpu_map_data;
 
 struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
 struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
-void cpu_aggr_map__delete(struct cpu_aggr_map *map);
-void cpu_aggr_map__put(struct cpu_aggr_map *map);
 
 struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data);
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
-- 
2.28.0



[PATCH 13/13 v4] perf tools: add thread field

2020-11-13 Thread James Clark
A separate field isn't strictly required. The core
field could be re-used for thread IDs as a single
field was used previously.

But separating them will avoid confusion and catch
potential errors where core IDs are read as thread
IDs and vice versa.

Also remove the placeholder id field which is now
no longer used.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/tests/topology.c|  8 
 tools/perf/util/cpumap.c   | 14 +++---
 tools/perf/util/cpumap.h   |  2 +-
 tools/perf/util/stat-display.c |  8 
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 694f786a77f3..2276db0b1b6f 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -119,7 +119,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
-   TEST_ASSERT_VAL("Core map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1);
}
 
// Test that die ID contains socket and die
@@ -131,7 +131,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
-   TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1);
}
 
// Test that socket ID contains only socket
@@ -141,7 +141,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
-   TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
+   TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1);
}
 
// Test that node ID contains only node
@@ -149,7 +149,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
id = cpu_map__get_node(map, i, NULL);
TEST_ASSERT_VAL("Node map - Node ID doesn't match",
cpu__get_node(map->map[i]) == id.node);
-   TEST_ASSERT_VAL("Node map - ID shouldn't be set", id.id == -1);
+   TEST_ASSERT_VAL("Node map - Thread shouldn't be set", id.thread 
== -1);
TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
}
perf_session__delete(session);
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index d988dfc4e6b6..5f824aa0311d 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -148,16 +148,16 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const 
void *b_pointer)
struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer;
 
-   if (a->id != b->id)
-   return a->id - b->id;
-   else if (a->node != b->node)
+   if (a->node != b->node)
return a->node - b->node;
else if (a->socket != b->socket)
return a->socket - b->socket;
else if (a->die != b->die)
return a->die - b->die;
-   else
+   else if (a->core != b->core)
return a->core - b->core;
+   else
+   return a->thread - b->thread;
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
@@ -617,7 +617,7 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread 
unsafe */
 
 bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
 {
-   return a.id == b.id &&
+   return a.thread == b.thread &&
a.node == b.node &&
a.socket == b.socket &&
a.die == b.die &&
@@ -626,7 +626,7 @@ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, 
struct aggr_cpu_id b)
 
 bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
 {
-   return a.id == -1 &&
+   return a.thread == -1 &&

[PATCH 11/13 v4] perf tools: Add separate die member

2020-11-13 Thread James Clark
Add die as a separate member so that it doesn't have to be
packed into the int value.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/builtin-stat.c  | 14 +++---
 tools/perf/tests/topology.c|  7 +--
 tools/perf/util/cpumap.c   | 28 ++--
 tools/perf/util/cpumap.h   |  6 +-
 tools/perf/util/stat-display.c |  6 +++---
 5 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 193e7a4e0c7b..514144dad8b1 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1389,11 +1389,7 @@ static struct aggr_cpu_id perf_env__get_die(struct 
perf_cpu_map *map, int idx, v
 * make a unique ID.
 */
id.socket = env->cpu[cpu].socket_id;
-
-   if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
-   id.id = env->cpu[cpu].die_id & 0xff;
+   id.die = env->cpu[cpu].die_id;
}
 
return id;
@@ -1407,20 +1403,16 @@ static struct aggr_cpu_id perf_env__get_core(struct 
perf_cpu_map *map, int idx,
 
if (cpu != -1) {
/*
-* encode die id in bit range 23:16
 * core_id is relative to socket and die,
 * we need a global id. So we combine
 * socket + die id + core id
 */
-   if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is 
too big.\n"))
-   return cpu_map__empty_aggr_cpu_id();
-
if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number 
is too big.\n"))
return cpu_map__empty_aggr_cpu_id();
 
id.socket = env->cpu[cpu].socket_id;
-   id.id = (env->cpu[cpu].die_id << 16) |
-  (env->cpu[cpu].core_id & 0x);
+   id.die = env->cpu[cpu].die_id;
+   id.id = env->cpu[cpu].core_id & 0x;
}
 
return id;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index afefe7456385..1256cf63d4d0 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -117,7 +117,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
 
TEST_ASSERT_VAL("Core map - Die ID doesn't match",
-   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
}
 
@@ -128,8 +128,9 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
 
TEST_ASSERT_VAL("Die map - Die ID doesn't match",
-   session->header.env.cpu[map->map[i]].die_id == 
cpu_map__id_to_die(id.id));
+   session->header.env.cpu[map->map[i]].die_id == id.die);
TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Die map - ID is set", id.id == -1);
}
 
// Test that socket ID contains only socket
@@ -138,6 +139,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
session->header.env.cpu[map->map[i]].socket_id == 
id.socket);
TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
+   TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1);
}
 
@@ -147,6 +149,7 @@ static int check_cpu_topology(char *path, struct 
perf_cpu_map *map)
TEST_ASSERT_VAL("Node map - Node ID doesn't match",
cpu__get_node(map->map[i]) == id.node);
TEST_ASSERT_VAL("Node map - ID shouldn't be set", id.id == -1);
+   TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
}
perf_session__delete(session);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 233b752cb469..c1d3539222f2 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -152,8 +152,1

[PATCH 05/13 v4] perf tools: add new map type for aggregation

2020-11-13 Thread James Clark
Currently this is a duplicate of perf_cpu_map so that
it can be used as a drop-in replacement.

In a later commit it will be changed from a map of ints
to use the new aggr_cpu_id struct.

No functional changes.

Signed-off-by: James Clark 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Mark Rutland 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Richter 
Cc: John Garry 
---
 tools/perf/util/cpumap.c | 32 
 tools/perf/util/cpumap.h | 10 ++
 2 files changed, 42 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index e777024c6676..7500225c8571 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -95,6 +95,38 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr)
return cpus;
 }
 
+struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
+{
+   struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr);
+
+   if (cpus != NULL) {
+   int i;
+
+   cpus->nr = nr;
+   for (i = 0; i < nr; i++)
+   cpus->map[i] = -1;
+
+   refcount_set(&cpus->refcnt, 1);
+   }
+
+   return cpus;
+}
+
+void cpu_aggr_map__delete(struct cpu_aggr_map *map)
+{
+   if (map) {
+   WARN_ONCE(refcount_read(&map->refcnt) != 0,
+ "cpu_aggr_map refcnt unbalanced\n");
+   free(map);
+   }
+}
+
+void cpu_aggr_map__put(struct cpu_aggr_map *map)
+{
+   if (map && refcount_dec_and_test(&map->refcnt))
+   cpu_aggr_map__delete(map);
+}
+
 static int cpu__get_topology_int(int cpu, const char *name, int *value)
 {
char path[PATH_MAX];
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index b8c2288a3f6d..ab6be8ef696f 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -11,9 +11,19 @@ struct aggr_cpu_id {
int id;
 };
 
+struct cpu_aggr_map {
+   refcount_t refcnt;
+   int nr;
+   int map[];
+};
+
 struct perf_record_cpu_map_data;
 
 struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
+struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
+void cpu_aggr_map__delete(struct cpu_aggr_map *map);
+void cpu_aggr_map__put(struct cpu_aggr_map *map);
+
 struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data);
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
-- 
2.28.0



  1   2   >