On Thu, Apr 8, 2010 at 11:03 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Thu, 2010-04-08 at 22:55 +0200, Peter Zijlstra wrote:
>> On Thu, 2010-04-08 at 22:45 +0200, Stephane Eranian wrote:
>> > There is a warn_on_once() check for PERF_SAMPLE_RAW which trips
>> > when using PEBS on both Core and Nehalem. The PEBS sample is 144
>> > bytes on Core and 176 bytes on Nehalem. Both are multiples of 8,
>> > but the size field is encoded as a 32-bit int, so the total is
>> > never a multiple of 8, which trips the check. I think the size
>> > should have been a u64, but now it is too late to change given it
>> > is ABI.
>>
>> PEBS hasn't seen -linus yet, so we can fix that.
>>
>> There are various things that do indeed rely on the perf buffer
>> always being u64 aligned, so this warning isn't bogus at all.
>
> On the subject of PEBS, we need to change the ABI before it does hit
> -linus. I've got something like the below, but I'm not quite sure of
> it.
>
> We could keep the PERF_RECORD_MISC_EXACT bit and allow non-fixed-up
> IPs in PERF_SAMPLE_EVENT_IP fields; that way we can avoid having to
> add both IP and EVENT_IP to samples.
>
You always have the original IP in the PEBS sample, so why add yet
another way of getting to the same data?
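
To make the alignment problem above concrete: a PERF_SAMPLE_RAW payload
is preceded by a u32 size field, so the Core and Nehalem record sizes
quoted at the top of the thread put 4 + 144 = 148 and 4 + 176 = 180
bytes on the stream, neither a multiple of 8. A minimal standalone
sketch of the arithmetic (the PERF_ALIGN helper here is illustrative,
not taken from the kernel):

#include <stdio.h>
#include <stdint.h>

/* Round x up to a multiple of a (a power of two); illustrative only. */
#define PERF_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	const uint32_t hdr  = sizeof(uint32_t);	/* the u32 'size' field */
	const uint32_t core = 144;		/* Core PEBS record */
	const uint32_t nhm  = 176;		/* Nehalem PEBS record */

	/* 148 and 180 bytes: not multiples of 8, hence the warning */
	printf("core: %u -> pad to %u\n",
	       hdr + core, PERF_ALIGN(hdr + core, 8u));
	printf("nhm:  %u -> pad to %u\n",
	       hdr + nhm, PERF_ALIGN(hdr + nhm, 8u));
	return 0;
}

Padding each raw record out to 152/184 bytes, or widening the size
field to a u64, would keep the stream u64-aligned either way.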
> ---
> Subject: perf: Change the PEBS interface
> From: Peter Zijlstra <a.p.zijls...@chello.nl>
> Date: Tue Mar 30 13:35:58 CEST 2010
>
> This patch changes the PEBS interface from perf_event_attr::precise
> and PERF_RECORD_MISC_EXACT to perf_event_attr::precise and
> PERF_SAMPLE_EVENT_IP.
>
> The rationale is that .precise=1 && !PERF_SAMPLE_EVENT_IP could be
> used to implement buffered PEBS.
>
I am not sure I understand what you mean by buffered PEBS. Are you
talking about using a PEBS buffer bigger than one entry? If so, how
can you do:
 - the LBR-based fixups for multiple samples (on PMU interrupt)
 - PID/TID attribution in system-wide mode

> Signed-off-by: Peter Zijlstra <a.p.zijls...@chello.nl>
> LKML-Reference: <new-submission>
> ---
>  arch/x86/include/asm/perf_event.h         |    9 --
>  arch/x86/kernel/cpu/perf_event_intel_ds.c |  133 +++++++++++++-----------------
>  include/linux/perf_event.h                |    7 +
>  kernel/perf_event.c                       |    6 +
>  tools/perf/builtin-top.c                  |    9 +-
>  5 files changed, 79 insertions(+), 85 deletions(-)
>
> Index: linux-2.6/arch/x86/include/asm/perf_event.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/perf_event.h
> +++ linux-2.6/arch/x86/include/asm/perf_event.h
> @@ -128,21 +128,12 @@ extern void perf_events_lapic_init(void)
>
>  #define PERF_EVENT_INDEX_OFFSET			0
>
> -/*
> - * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
> - * This flag is otherwise unused and ABI specified to be 0, so nobody should
> - * care what we do with it.
> - */
> -#define PERF_EFLAGS_EXACT	(1UL << 3)
> -
>  #define perf_misc_flags(regs)				\
>  ({	int misc = 0;					\
>  	if (user_mode(regs))				\
>  		misc |= PERF_RECORD_MISC_USER;		\
>  	else						\
>  		misc |= PERF_RECORD_MISC_KERNEL;	\
> -	if (regs->flags & PERF_EFLAGS_EXACT)		\
> -		misc |= PERF_RECORD_MISC_EXACT;		\
>  	misc; })
>
>  #define perf_instruction_pointer(regs)	((regs)->ip)
> Index: linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -330,7 +330,8 @@ static void intel_pmu_pebs_enable(struct
>  	cpuc->pebs_enabled |= 1ULL << hwc->idx;
>  	WARN_ON_ONCE(cpuc->enabled);
>
> -	if (x86_pmu.intel_cap.pebs_trap)
> +	if (x86_pmu.intel_cap.pebs_trap &&
> +	    (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
>  		intel_pmu_lbr_enable(event);
>  }
>
> @@ -345,7 +346,8 @@ static void intel_pmu_pebs_disable(struc
>
>  	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
>
> -	if (x86_pmu.intel_cap.pebs_trap)
> +	if (x86_pmu.intel_cap.pebs_trap &&
> +	    (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
>  		intel_pmu_lbr_disable(event);
>  }
>
> @@ -376,12 +378,12 @@ static inline bool kernel_ip(unsigned lo
>  #endif
>  }
>
> -static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> +static int intel_pmu_pebs_fixup_ip(unsigned long *ipp)
>  {
>  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>  	unsigned long from = cpuc->lbr_entries[0].from;
>  	unsigned long old_to, to = cpuc->lbr_entries[0].to;
> -	unsigned long ip = regs->ip;
> +	unsigned long ip = *ipp;
>
>  	/*
>  	 * We don't need to fixup if the PEBS assist is fault like
>
> @@ -412,7 +414,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>  	 * We sampled a branch insn, rewind using the LBR stack
>  	 */
>  	if (ip == to) {
> -		regs->ip = from;
> +		*ipp = from;
>  		return 1;
>  	}
>
> @@ -439,7 +441,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>  	} while (to < ip);
>
>  	if (to == ip) {
> -		regs->ip = old_to;
> +		*ipp = old_to;
>  		return 1;
>  	}
>
> @@ -452,15 +454,62 @@ static int intel_pmu_pebs_fixup_ip(struc
>
>  static int intel_pmu_save_and_restart(struct perf_event *event);
>
> +static void __intel_pmu_pebs_event(struct perf_event *event,
> +				   struct pt_regs *iregs, void *__pebs)
> +{
> +	/*
> +	 * We cast to pebs_record_core since that is a subset of
> +	 * both formats and we don't use the other fields in this
> +	 * routine.
> +	 */
> +	struct pebs_record_core *pebs = __pebs;
> +	struct perf_sample_data data;
> +	struct perf_raw_record raw;
> +	struct pt_regs regs;
> +
> +	if (!intel_pmu_save_and_restart(event))
> +		return;
> +
> +	perf_sample_data_init(&data, 0);
> +	data.period = event->hw.last_period;
> +
> +	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> +		raw.size = x86_pmu.pebs_record_size;
> +		raw.data = pebs;
> +		data.raw = &raw;
> +	}
> +
> +	/*
> +	 * We use the interrupt regs as a base because the PEBS record
> +	 * does not contain a full regs set, specifically it seems to
> +	 * lack segment descriptors, which get used by things like
> +	 * user_mode().
> +	 *
> +	 * In the simple case fix up only the IP and BP,SP regs, for
> +	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
> +	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
> +	 */
> +	regs = *iregs;
> +	regs.ip = pebs->ip;
> +	regs.bp = pebs->bp;
> +	regs.sp = pebs->sp;
> +
> +	if (event->attr.sample_type & PERF_SAMPLE_EVENT_IP) {
> +		unsigned long event_ip = pebs->ip;
> +		if (intel_pmu_pebs_fixup_ip(&event_ip))
> +			data.event_ip = event_ip;
> +	}
> +
> +	if (perf_event_overflow(event, 1, &data, &regs))
> +		x86_pmu_stop(event);
> +}
> +
>  static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
>  {
>  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>  	struct debug_store *ds = cpuc->ds;
>  	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
>  	struct pebs_record_core *at, *top;
> -	struct perf_sample_data data;
> -	struct perf_raw_record raw;
> -	struct pt_regs regs;
>  	int n;
>
>  	if (!ds || !x86_pmu.pebs)
> @@ -486,9 +535,6 @@ static void intel_pmu_drain_pebs_core(st
>  	if (n <= 0)
>  		return;
>
> -	if (!intel_pmu_save_and_restart(event))
> -		return;
> -
>  	/*
>  	 * Should not happen, we program the threshold at 1 and do not
>  	 * set a reset value.
> @@ -496,37 +542,7 @@ static void intel_pmu_drain_pebs_core(st
>  	WARN_ON_ONCE(n > 1);
>  	at += n - 1;
>
> -	perf_sample_data_init(&data, 0);
> -	data.period = event->hw.last_period;
> -
> -	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> -		raw.size = x86_pmu.pebs_record_size;
> -		raw.data = at;
> -		data.raw = &raw;
> -	}
> -
> -	/*
> -	 * We use the interrupt regs as a base because the PEBS record
> -	 * does not contain a full regs set, specifically it seems to
> -	 * lack segment descriptors, which get used by things like
> -	 * user_mode().
> -	 *
> -	 * In the simple case fix up only the IP and BP,SP regs, for
> -	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
> -	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
> -	 */
> -	regs = *iregs;
> -	regs.ip = at->ip;
> -	regs.bp = at->bp;
> -	regs.sp = at->sp;
> -
> -	if (intel_pmu_pebs_fixup_ip(&regs))
> -		regs.flags |= PERF_EFLAGS_EXACT;
> -	else
> -		regs.flags &= ~PERF_EFLAGS_EXACT;
> -
> -	if (perf_event_overflow(event, 1, &data, &regs))
> -		x86_pmu_stop(event);
> +	__intel_pmu_pebs_event(event, iregs, at);
>  }
>
>  static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> @@ -534,10 +550,7 @@ static void intel_pmu_drain_pebs_nhm(str
>  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>  	struct debug_store *ds = cpuc->ds;
>  	struct pebs_record_nhm *at, *top;
> -	struct perf_sample_data data;
>  	struct perf_event *event = NULL;
> -	struct perf_raw_record raw;
> -	struct pt_regs regs;
>  	u64 status = 0;
>  	int bit, n;
>
> @@ -579,33 +592,7 @@ static void intel_pmu_drain_pebs_nhm(str
>  		if (!event || bit >= MAX_PEBS_EVENTS)
>  			continue;
>
> -		if (!intel_pmu_save_and_restart(event))
> -			continue;
> -
> -		perf_sample_data_init(&data, 0);
> -		data.period = event->hw.last_period;
> -
> -		if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> -			raw.size = x86_pmu.pebs_record_size;
> -			raw.data = at;
> -			data.raw = &raw;
> -		}
> -
> -		/*
> -		 * See the comment in intel_pmu_drain_pebs_core()
> -		 */
> -		regs = *iregs;
> -		regs.ip = at->ip;
> -		regs.bp = at->bp;
> -		regs.sp = at->sp;
> -
> -		if (intel_pmu_pebs_fixup_ip(&regs))
> -			regs.flags |= PERF_EFLAGS_EXACT;
> -		else
> -			regs.flags &= ~PERF_EFLAGS_EXACT;
> -
> -		if (perf_event_overflow(event, 1, &data, &regs))
> -			x86_pmu_stop(event);
> +		__intel_pmu_pebs_event(event, iregs, at);
>  	}
>  }
>
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -125,8 +125,9 @@ enum perf_event_sample_format {
>  	PERF_SAMPLE_PERIOD			= 1U << 8,
>  	PERF_SAMPLE_STREAM_ID			= 1U << 9,
>  	PERF_SAMPLE_RAW				= 1U << 10,
> +	PERF_SAMPLE_EVENT_IP			= 1U << 11,
>
> -	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
> +	PERF_SAMPLE_MAX = 1U << 12,		/* non-ABI */
>  };
>
>  /*
> @@ -294,7 +295,6 @@ struct perf_event_mmap_page {
>  #define PERF_RECORD_MISC_USER			(2 << 0)
>  #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
>
> -#define PERF_RECORD_MISC_EXACT			(1 << 14)
>  /*
>   * Reserve the last bit to indicate some extended misc field
>   */
> @@ -389,6 +389,7 @@ enum perf_event_type {
>  	 *	struct perf_event_header	header;
>  	 *
>  	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
> +	 *	{ u64			event_ip; } && PERF_SAMPLE_EVENT_IP
>  	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
>  	 *	{ u64			time;     } && PERF_SAMPLE_TIME
>  	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
> @@ -804,6 +805,7 @@ struct perf_sample_data {
>  	u64				type;
>
>  	u64				ip;
> +	u64				event_ip;
>  	struct {
>  		u32	pid;
>  		u32	tid;
> @@ -824,6 +826,7 @@ struct perf_sample_data {
>  static inline
>  void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
>  {
> +	data->event_ip = 0;
>  	data->addr = addr;
>  	data->raw  = NULL;
>  }
> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -3158,6 +3158,9 @@ void perf_output_sample(struct perf_outp
>  	if (sample_type & PERF_SAMPLE_IP)
>  		perf_output_put(handle, data->ip);
>
> +	if (sample_type & PERF_SAMPLE_EVENT_IP)
> +		perf_output_put(handle, data->event_ip);
> +
>  	if (sample_type & PERF_SAMPLE_TID)
>  		perf_output_put(handle, data->tid_entry);
>
> @@ -3237,6 +3240,9 @@ void perf_prepare_sample(struct perf_eve
>  		header->size += sizeof(data->ip);
>  	}
>
> +	if (sample_type & PERF_SAMPLE_EVENT_IP)
> +		header->size += sizeof(data->event_ip);
> +
>  	if (sample_type & PERF_SAMPLE_TID) {
>  		/* namespace issues */
>  		data->tid_entry.pid = perf_event_pid(event, current);
> Index: linux-2.6/tools/perf/builtin-top.c
> ===================================================================
> --- linux-2.6.orig/tools/perf/builtin-top.c
> +++ linux-2.6/tools/perf/builtin-top.c
> @@ -975,8 +975,10 @@ static void event__process_sample(const
>  		return;
>  	}
>
> -	if (self->header.misc & PERF_RECORD_MISC_EXACT)
> +	if (ip)
>  		exact_samples++;
> +	else
> +		return;
>
>  	if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
>  	    al.filtered)
> @@ -1169,6 +1171,11 @@ static void start_counter(int i, int cou
>
>  	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
>
> +	if (attr->precise) {
> +		attr->sample_type &= ~PERF_SAMPLE_IP;
> +		attr->sample_type |= PERF_SAMPLE_EVENT_IP;
> +	}
> +
>  	if (freq) {
>  		attr->sample_type	|= PERF_SAMPLE_PERIOD;
>  		attr->freq		= 1;
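
For reference, requesting the proposed bit from userspace would look
roughly like the builtin-top hunk above. A minimal sketch only:
PERF_SAMPLE_EVENT_IP and the attr.precise bit come from this patch and
exist only in a tree with it applied, so this does not build against
an unpatched header:

#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* From the patch above; not part of the upstream ABI. */
#ifndef PERF_SAMPLE_EVENT_IP
#define PERF_SAMPLE_EVENT_IP	(1U << 11)
#endif

/* Open a PEBS-precise cycles event that asks for fixed-up IPs. */
static int open_precise_cycles(pid_t pid, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size		= sizeof(attr);
	attr.type		= PERF_TYPE_HARDWARE;
	attr.config		= PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period	= 100000;
	attr.precise		= 1;	/* the patch's PEBS request bit */
	attr.sample_type	= PERF_SAMPLE_TID | PERF_SAMPLE_EVENT_IP;

	return syscall(__NR_perf_event_open, &attr, pid, cpu,
		       -1 /* group_fd */, 0 /* flags */);
}

A consumer would then read the fixed-up address from the event_ip
field of each sample, falling back to the raw PEBS ip when the LBR
fixup failed (event_ip stays 0, since perf_sample_data_init() clears
it).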