On Thu, 2010-04-08 at 22:55 +0200, Peter Zijlstra wrote:
> On Thu, 2010-04-08 at 22:45 +0200, Stephane Eranian wrote:
> >     There is a warn_on_once() check for PERF_SAMPLE_RAW which trips
> >     when using PEBS on both Core and Nehalem. Core PEBS sample size is 144
> >     bytes and 176 bytes for Nehalem. Both are multiples of 8, but the size
> >     field is encoded as int, thus the total is never a multiple of 8 which
> >     trips the check. I think the size should have been u64, but now it is
> >     too late to change given it is ABI.
> 
> PEBS hasn't seen -linus yet, so we can fix that.
> 
> There's various things that do indeed rely on the perf buffer to always
> be u64 aligned, so this warning isn't bogus at all.

On the subject of PEBS, we need to change the ABI before it does hit
-linus. I've got something like the below, but I'm not quite sure of
it.

We could keep the PERF_RECORD_MISC_EXACT bit and allow non-fixed-up IPs
in PERF_SAMPLE_EVENT_IP fields; that way we can avoid having to add both
IP and EVENT_IP to samples.

---
Subject: perf: Change the PEBS interface
From: Peter Zijlstra <a.p.zijls...@chello.nl>
Date: Tue Mar 30 13:35:58 CEST 2010

This patch changes the PEBS interface from perf_event_attr::precise
and PERF_RECORD_MISC_EXACT to perf_event_attr::precise and
PERF_SAMPLE_EVENT_IP.

The rationale is that .precise=1 !PERF_SAMPLE_EVENT_IP could be used
to implement buffered PEBS.

Signed-off-by: Peter Zijlstra <a.p.zijls...@chello.nl>
LKML-Reference: <new-submission>
---
 arch/x86/include/asm/perf_event.h         |    9 --
 arch/x86/kernel/cpu/perf_event_intel_ds.c |  133 +++++++++++++-----------------
 include/linux/perf_event.h                |    7 +
 kernel/perf_event.c                       |    6 +
 tools/perf/builtin-top.c                  |    9 +-
 5 files changed, 79 insertions(+), 85 deletions(-)

Index: linux-2.6/arch/x86/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/perf_event.h
+++ linux-2.6/arch/x86/include/asm/perf_event.h
@@ -128,21 +128,12 @@ extern void perf_events_lapic_init(void)
 
 #define PERF_EVENT_INDEX_OFFSET                        0
 
-/*
- * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
- * This flag is otherwise unused and ABI specified to be 0, so nobody should
- * care what we do with it.
- */
-#define PERF_EFLAGS_EXACT      (1UL << 3)
-
 #define perf_misc_flags(regs)                          \
 ({     int misc = 0;                                   \
        if (user_mode(regs))                            \
                misc |= PERF_RECORD_MISC_USER;          \
        else                                            \
                misc |= PERF_RECORD_MISC_KERNEL;        \
-       if (regs->flags & PERF_EFLAGS_EXACT)            \
-               misc |= PERF_RECORD_MISC_EXACT;         \
        misc; })
 
 #define perf_instruction_pointer(regs) ((regs)->ip)
Index: linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -330,7 +330,8 @@ static void intel_pmu_pebs_enable(struct
        cpuc->pebs_enabled |= 1ULL << hwc->idx;
        WARN_ON_ONCE(cpuc->enabled);
 
-       if (x86_pmu.intel_cap.pebs_trap)
+       if (x86_pmu.intel_cap.pebs_trap &&
+                       (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
                intel_pmu_lbr_enable(event);
 }
 
@@ -345,7 +346,8 @@ static void intel_pmu_pebs_disable(struc
 
        hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-       if (x86_pmu.intel_cap.pebs_trap)
+       if (x86_pmu.intel_cap.pebs_trap &&
+                       (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
                intel_pmu_lbr_disable(event);
 }
 
@@ -376,12 +378,12 @@ static inline bool kernel_ip(unsigned lo
 #endif
 }
 
-static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+static int intel_pmu_pebs_fixup_ip(unsigned long *ipp)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        unsigned long from = cpuc->lbr_entries[0].from;
        unsigned long old_to, to = cpuc->lbr_entries[0].to;
-       unsigned long ip = regs->ip;
+       unsigned long ip = *ipp;
 
        /*
         * We don't need to fixup if the PEBS assist is fault like
@@ -412,7 +414,7 @@ static int intel_pmu_pebs_fixup_ip(struc
         * We sampled a branch insn, rewind using the LBR stack
         */
        if (ip == to) {
-               regs->ip = from;
+               *ipp = from;
                return 1;
        }
 
@@ -439,7 +441,7 @@ static int intel_pmu_pebs_fixup_ip(struc
        } while (to < ip);
 
        if (to == ip) {
-               regs->ip = old_to;
+               *ipp = old_to;
                return 1;
        }
 
@@ -452,15 +454,62 @@ static int intel_pmu_pebs_fixup_ip(struc
 
 static int intel_pmu_save_and_restart(struct perf_event *event);
 
+static void __intel_pmu_pebs_event(struct perf_event *event,
+                                  struct pt_regs *iregs, void *__pebs)
+{
+       /*
+        * We cast to pebs_record_core since that is a subset of
+        * both formats and we don't use the other fields in this
+        * routine.
+        */
+       struct pebs_record_core *pebs = __pebs;
+       struct perf_sample_data data;
+       struct perf_raw_record raw;
+       struct pt_regs regs;
+
+       if (!intel_pmu_save_and_restart(event))
+               return;
+
+       perf_sample_data_init(&data, 0);
+       data.period = event->hw.last_period;
+
+       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+               raw.size = x86_pmu.pebs_record_size;
+               raw.data = pebs;
+               data.raw = &raw;
+       }
+
+       /*
+        * We use the interrupt regs as a base because the PEBS record
+        * does not contain a full regs set, specifically it seems to
+        * lack segment descriptors, which get used by things like
+        * user_mode().
+        *
+        * In the simple case fix up only the IP and BP,SP regs, for
+        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+        */
+       regs = *iregs;
+       regs.ip = pebs->ip;
+       regs.bp = pebs->bp;
+       regs.sp = pebs->sp;
+
+       if (event->attr.sample_type & PERF_SAMPLE_EVENT_IP) {
+               unsigned long event_ip = pebs->ip;
+               if (intel_pmu_pebs_fixup_ip(&event_ip))
+                       data.event_ip = event_ip;
+       }
+
+       if (perf_event_overflow(event, 1, &data, &regs))
+               x86_pmu_stop(event);
+}
+
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        struct debug_store *ds = cpuc->ds;
        struct perf_event *event = cpuc->events[0]; /* PMC0 only */
        struct pebs_record_core *at, *top;
-       struct perf_sample_data data;
-       struct perf_raw_record raw;
-       struct pt_regs regs;
        int n;
 
        if (!ds || !x86_pmu.pebs)
@@ -486,9 +535,6 @@ static void intel_pmu_drain_pebs_core(st
        if (n <= 0)
                return;
 
-       if (!intel_pmu_save_and_restart(event))
-               return;
-
        /*
         * Should not happen, we program the threshold at 1 and do not
         * set a reset value.
@@ -496,37 +542,7 @@ static void intel_pmu_drain_pebs_core(st
        WARN_ON_ONCE(n > 1);
        at += n - 1;
 
-       perf_sample_data_init(&data, 0);
-       data.period = event->hw.last_period;
-
-       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               raw.size = x86_pmu.pebs_record_size;
-               raw.data = at;
-               data.raw = &raw;
-       }
-
-       /*
-        * We use the interrupt regs as a base because the PEBS record
-        * does not contain a full regs set, specifically it seems to
-        * lack segment descriptors, which get used by things like
-        * user_mode().
-        *
-        * In the simple case fix up only the IP and BP,SP regs, for
-        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
-        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
-        */
-       regs = *iregs;
-       regs.ip = at->ip;
-       regs.bp = at->bp;
-       regs.sp = at->sp;
-
-       if (intel_pmu_pebs_fixup_ip(&regs))
-               regs.flags |= PERF_EFLAGS_EXACT;
-       else
-               regs.flags &= ~PERF_EFLAGS_EXACT;
-
-       if (perf_event_overflow(event, 1, &data, &regs))
-               x86_pmu_stop(event);
+       __intel_pmu_pebs_event(event, iregs, at);
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@@ -534,10 +550,7 @@ static void intel_pmu_drain_pebs_nhm(str
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        struct debug_store *ds = cpuc->ds;
        struct pebs_record_nhm *at, *top;
-       struct perf_sample_data data;
        struct perf_event *event = NULL;
-       struct perf_raw_record raw;
-       struct pt_regs regs;
        u64 status = 0;
        int bit, n;
 
@@ -579,33 +592,7 @@ static void intel_pmu_drain_pebs_nhm(str
                if (!event || bit >= MAX_PEBS_EVENTS)
                        continue;
 
-               if (!intel_pmu_save_and_restart(event))
-                       continue;
-
-               perf_sample_data_init(&data, 0);
-               data.period = event->hw.last_period;
-
-               if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-                       raw.size = x86_pmu.pebs_record_size;
-                       raw.data = at;
-                       data.raw = &raw;
-               }
-
-               /*
-                * See the comment in intel_pmu_drain_pebs_core()
-                */
-               regs = *iregs;
-               regs.ip = at->ip;
-               regs.bp = at->bp;
-               regs.sp = at->sp;
-
-               if (intel_pmu_pebs_fixup_ip(&regs))
-                       regs.flags |= PERF_EFLAGS_EXACT;
-               else
-                       regs.flags &= ~PERF_EFLAGS_EXACT;
-
-               if (perf_event_overflow(event, 1, &data, &regs))
-                       x86_pmu_stop(event);
+               __intel_pmu_pebs_event(event, iregs, at);
        }
 }
 
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -125,8 +125,9 @@ enum perf_event_sample_format {
        PERF_SAMPLE_PERIOD                      = 1U << 8,
        PERF_SAMPLE_STREAM_ID                   = 1U << 9,
        PERF_SAMPLE_RAW                         = 1U << 10,
+       PERF_SAMPLE_EVENT_IP                    = 1U << 11,
 
-       PERF_SAMPLE_MAX = 1U << 11,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 12,             /* non-ABI */
 };
 
 /*
@@ -294,7 +295,6 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_USER                  (2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR            (3 << 0)
 
-#define PERF_RECORD_MISC_EXACT                 (1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */
@@ -389,6 +389,7 @@ enum perf_event_type {
         *      struct perf_event_header        header;
         *
         *      { u64                   ip;       } && PERF_SAMPLE_IP
+        *      { u64                   event_ip; } && PERF_SAMPLE_EVENT_IP
         *      { u32                   pid, tid; } && PERF_SAMPLE_TID
         *      { u64                   time;     } && PERF_SAMPLE_TIME
         *      { u64                   addr;     } && PERF_SAMPLE_ADDR
@@ -804,6 +805,7 @@ struct perf_sample_data {
        u64                             type;
 
        u64                             ip;
+       u64                             event_ip;
        struct {
                u32     pid;
                u32     tid;
@@ -824,6 +826,7 @@ struct perf_sample_data {
 static inline
 void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
 {
+       data->event_ip = 0;
        data->addr = addr;
        data->raw  = NULL;
 }
Index: linux-2.6/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/kernel/perf_event.c
+++ linux-2.6/kernel/perf_event.c
@@ -3158,6 +3158,9 @@ void perf_output_sample(struct perf_outp
        if (sample_type & PERF_SAMPLE_IP)
                perf_output_put(handle, data->ip);
 
+       if (sample_type & PERF_SAMPLE_EVENT_IP)
+               perf_output_put(handle, data->event_ip);
+
        if (sample_type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);
 
@@ -3237,6 +3240,9 @@ void perf_prepare_sample(struct perf_eve
                header->size += sizeof(data->ip);
        }
 
+       if (sample_type & PERF_SAMPLE_EVENT_IP)
+               header->size += sizeof(data->event_ip);
+
        if (sample_type & PERF_SAMPLE_TID) {
                /* namespace issues */
                data->tid_entry.pid = perf_event_pid(event, current);
Index: linux-2.6/tools/perf/builtin-top.c
===================================================================
--- linux-2.6.orig/tools/perf/builtin-top.c
+++ linux-2.6/tools/perf/builtin-top.c
@@ -975,8 +975,10 @@ static void event__process_sample(const 
                return;
        }
 
-       if (self->header.misc & PERF_RECORD_MISC_EXACT)
+       if (ip)
                exact_samples++;
+       else
+               return;
 
        if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
            al.filtered)
@@ -1169,6 +1171,11 @@ static void start_counter(int i, int cou
 
        attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
+       if (attr->precise) {
+               attr->sample_type &= ~PERF_SAMPLE_IP;
+               attr->sample_type |= PERF_SAMPLE_EVENT_IP;
+       }
+
        if (freq) {
                attr->sample_type       |= PERF_SAMPLE_PERIOD;
                attr->freq              = 1;



------------------------------------------------------------------------------
Download Intel® Parallel Studio Eval
Try the new software tools for yourself. Speed compiling, find bugs
proactively, and fine-tune applications for parallel performance.
See why Intel Parallel Studio got high marks during beta.
http://p.sf.net/sfu/intel-sw-dev
_______________________________________________
perfmon2-devel mailing list
perfmon2-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/perfmon2-devel

Reply via email to