Hi,

On Thu, Feb 18, 2010 at 11:25 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Sun, 2010-02-14 at 11:12 +0100, Peter Zijlstra wrote:
>>
>> Dealing with context switches is also going to be tricky, where we have
>> to save and 'restore' LBR stacks for per-task counters.
>
> OK, so I poked at the LBR hardware a bit, sadly the TOS really doesn't
> count beyond the few bits it requires :-(
>
The TOS is also a read-only MSR.

> I had hopes it would, since that would make it easier to share the LBR,
> simply take a TOS snapshot when you schedule the counter in, and never
> roll back further for that particular counter.
>
> As it stands we'll have to wipe the full LBR state every time we 'touch'
> it, which makes it less useful for cpu-bound counters.
>
Yes, you need to clean it up each time you snapshot it and each time you
restore it. The patch does not seem to handle LBR context switches.

> Also, not all hw (core and pentium-m) supports the freeze_lbrs_on_pmi
> bit, what we could do for those is stick an unconditional LBR disable
> very early in the NMI path and simply roll back the stack until we hit a
> branch into the NMI vector, that should leave a few usable LBR entries.
>
You need to be consistent across the CPUs. If a CPU does not provide
freeze_lbrs_on_pmi, then I would simply not support LBR on it as a first
approach. Same thing if the LBR stack is less than 4 entries deep: I don't
think you'll get anything useful out of it.

> For AMD and P6 there is only a single LBR record, AMD seems to freeze
> the thing on #DB traps but the PMI isn't qualified as one afaict,
> rendering the single entry useless (didn't look at the P6 details).
>
> hackery below..

The patch does not address the configuration options available on Intel
Nehalem/Westmere, i.e., LBR_SELECT (see Vol 3a table 16-9). We can handle
the priv level separately, as it can be derived from the event's exclude_*
bits. But if you want to allow multiple events in a group to use
PERF_SAMPLE_LBR, then you need to ensure LBR_SELECT is set to the same
value for all of them, priv levels included. Furthermore, LBR_SELECT is
shared between HT threads. We need to either add another field to
perf_event_attr or encode this in the config field, though that is ugly
because it is related not to the event itself but to the sample_type.

The patch is also missing the sampling part, i.e., the dump of the LBR
(in sequential order) into the sampling buffer.

I would also pick a better name than PERF_SAMPLE_LBR. LBR is an Intel
term. Maybe PERF_SAMPLE_TAKEN_BRANCH.
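To make the LBR_SELECT/priv-level point concrete, here is a rough sketch
(purely illustrative, not part of the patch below) of how the two
privilege-filter bits could be derived from an event's exclude_* fields.
The bit positions are the CPL_EQ_0/CPL_NEQ_0 filter bits from Table 16-9,
where a set bit suppresses capture of that class of branch; the macro
names and the helper function are made up here:

#define LBR_SELECT_CPL_EQ_0	(1ULL << 0)	/* filter out branches in ring 0  */
#define LBR_SELECT_CPL_NEQ_0	(1ULL << 1)	/* filter out branches in ring >0 */

static u64 lbr_select_from_event(struct perf_event *event)
{
	u64 lbr_select = 0;

	/* excluding a priv level means setting the corresponding filter-out bit */
	if (event->attr.exclude_kernel)
		lbr_select |= LBR_SELECT_CPL_EQ_0;
	if (event->attr.exclude_user)
		lbr_select |= LBR_SELECT_CPL_NEQ_0;

	return lbr_select;
}

Whatever value this produces would have to be identical for all
PERF_SAMPLE_LBR events scheduled on the core (both HT siblings included)
before it can be written to the MSR, which is exactly the consistency
problem described above.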
> --- > arch/x86/include/asm/perf_event.h | 24 +++ > arch/x86/kernel/cpu/perf_event.c | 233 > +++++++++++++++++++++++++++++++++++--- > arch/x86/kernel/traps.c | 3 > include/linux/perf_event.h | 7 - > 4 files changed, 251 insertions(+), 16 deletions(-) > > Index: linux-2.6/arch/x86/kernel/cpu/perf_event.c > =================================================================== > --- linux-2.6.orig/arch/x86/kernel/cpu/perf_event.c > +++ linux-2.6/arch/x86/kernel/cpu/perf_event.c > @@ -104,6 +104,10 @@ struct amd_nb { > struct event_constraint event_constraints[X86_PMC_IDX_MAX]; > }; > > +struct lbr_entry { > + u64 from, to, flags; > +}; > + > struct cpu_hw_events { > struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order > */ > unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; > @@ -117,6 +121,10 @@ struct cpu_hw_events { > u64 tags[X86_PMC_IDX_MAX]; > struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled > order */ > struct amd_nb *amd_nb; > + > + int lbr_users; > + int lbr_entries; > + struct lbr_entry lbr_stack[16]; > }; > > #define __EVENT_CONSTRAINT(c, n, m, w) {\ > @@ -187,6 +195,19 @@ struct x86_pmu { > void (*put_event_constraints)(struct cpu_hw_events *cpuc, > struct perf_event *event); > struct event_constraint *event_constraints; > + > + unsigned long lbr_tos; > + unsigned long lbr_from, lbr_to; > + int lbr_nr; > + int lbr_ctl; > + int lbr_format; > +}; > + > +enum { > + LBR_FORMAT_32 = 0x00, > + LBR_FORMAT_LIP = 0x01, > + LBR_FORMAT_EIP = 0x02, > + LBR_FORMAT_EIP_FLAGS = 0x03, > }; > > static struct x86_pmu x86_pmu __read_mostly; > @@ -1203,6 +1224,52 @@ static void intel_pmu_disable_bts(void) > update_debugctlmsr(debugctlmsr); > } > > +static void __intel_pmu_enable_lbr(void) > +{ > + u64 debugctl; > + > + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); > + debugctl |= x86_pmu.lbr_ctl; > + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); > +} > + > +static void intel_pmu_enable_lbr(void) > +{ > + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); > + > + if (!x86_pmu.lbr_nr) > + return; > + > + if (!cpuc->lbr_users) > + __intel_pmu_enable_lbr(); > + > + cpuc->lbr_users++; > +} > + > +static void __intel_pmu_disable_lbr(void) > +{ > + u64 debugctl; > + > + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); > + debugctl &= ~x86_pmu.lbr_ctl; > + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); > +} > + > +static void intel_pmu_disable_lbr(void) > +{ > + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); > + > + if (!x86_pmu.lbr_nr) > + return; > + > + cpuc->lbr_users--; > + > + BUG_ON(cpuc->lbr_users < 0); > + > + if (!cpuc->lbr_users) > + __intel_pmu_disable_lbr(); > +} > + > static void intel_pmu_pebs_enable(struct hw_perf_event *hwc) > { > struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); > @@ -1402,6 +1469,9 @@ void hw_perf_disable(void) > cpuc->enabled = 0; > barrier(); > > + if (cpuc->lbr_users) > + __intel_pmu_disable_lbr(); > + > x86_pmu.disable_all(); > } > > @@ -1703,6 +1773,10 @@ void hw_perf_enable(void) > barrier(); > > x86_pmu.enable_all(); > + > + // XXX > + if (cpuc->lbr_users = 1) > + __intel_pmu_enable_lbr(); > } > > static inline u64 intel_pmu_get_status(void) > @@ -2094,7 +2168,6 @@ static void intel_pmu_drain_pebs_core(st > struct perf_event_header header; > struct perf_sample_data data; > struct pt_regs regs; > - u64 > > if (!event || !ds || !x86_pmu.pebs) > return; > @@ -2114,7 +2187,7 @@ static void intel_pmu_drain_pebs_core(st > > perf_prepare_sample(&header, &data, event, ®s); > > - event.hw.interrupts += (top - at); > + event->hw.interrupts 
+= (top - at); > atomic64_add((top - at) * event->hw.last_period, &event->count); > > if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) > @@ -2188,6 +2261,84 @@ static void intel_pmu_drain_pebs_nhm(str > } > } > > +static inline u64 intel_pmu_lbr_tos(void) > +{ > + u64 tos; > + > + rdmsrl(x86_pmu.lbr_tos, tos); > + return tos; > +} > + > +static void > +intel_pmu_read_lbr_32(struct cpu_hw_events *cpuc, struct perf_event *event) > +{ > + struct hw_perf_event *hwc = &event->hw; > + unsigned long mask = x86_pmu.lbr_nr - 1; > + u64 tos = intel_pmu_lbr_tos(); > + int i; > + > + for (i = 0; tos > hwc->lbr_tos && i < x86_pmu.lbr_nr; i++, tos--) { > + unsigned long lbr_idx = (tos - i) & mask; > + union { > + struct { > + u32 from; > + u32 to; > + }; > + u64 lbr; > + } msr_lastbranch; > + > + rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); > + > + cpuc->lbr_stack[i].from = msr_lastbranch.from; > + cpuc->lbr_stack[i].to = msr_lastbranch.to; > + cpuc->lbr_stack[i].flags = 0; > + } > + cpuc->lbr_entries = i; > +} > + > +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) > + > +/* > + * Due to lack of segmentation in Linux the effective address (offset) > + * is the same as the linear address, allowing us to merge the LIP and EIP > + * LBR formats. > + */ > +static void > +intel_pmu_read_lbr_64(struct cpu_hw_events *cpuc, struct perf_event *event) > +{ > + struct hw_perf_event *hwc = &event->hw; > + unsigned long mask = x86_pmu.lbr_nr - 1; > + u64 tos = intel_pmu_lbr_tos(); > + int i; > + > + for (i = 0; tos > hwc->lbr_tos && i < x86_pmu.lbr_nr; i++, tos--) { > + unsigned long lbr_idx = (tos - i) & mask; > + u64 from, to, flags = 0; > + > + rdmsrl(x86_pmu.lbr_from + lbr_idx, from); > + rdmsrl(x86_pmu.lbr_to + lbr_idx, to); > + > + if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) { > + flags = !!(from & LBR_FROM_FLAG_MISPRED); > + from = (u64)((((s64)from) << 1) >> 1); > + } > + > + cpuc->lbr_stack[i].from = from; > + cpuc->lbr_stack[i].to = to; > + cpuc->lbr_stack[i].flags = flags; > + } > + cpuc->lbr_entries = i; > +} > + > +static void > +intel_pmu_read_lbr(struct cpu_hw_events *cpuc, struct perf_event *event) > +{ > + if (x86_pmu.lbr_format == LBR_FORMAT_32) > + intel_pmu_read_lbr_32(cpuc, event); > + else > + intel_pmu_read_lbr_64(cpuc, event); > +} > + > static void x86_pmu_stop(struct perf_event *event) > { > struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); > @@ -2456,11 +2607,26 @@ perf_event_nmi_handler(struct notifier_b > * If the first NMI handles both, the latter will be empty and daze > * the CPU. 
> */ > + trace_printk("LBR TOS: %Ld\n", intel_pmu_lbr_tos()); > x86_pmu.handle_irq(regs); > > return NOTIFY_STOP; > } > > +static __read_mostly struct notifier_block perf_event_nmi_notifier = { > + .notifier_call = perf_event_nmi_handler, > + .next = NULL, > + .priority = 1 > +}; > + > +void perf_nmi_exit(void) > +{ > + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); > + > + if (cpuc->lbr_users) > + __intel_pmu_enable_lbr(); > +} > + > static struct event_constraint unconstrained; /* can schedule */ > static struct event_constraint null_constraint; /* can't schedule */ > static struct event_constraint bts_constraint = > @@ -2761,12 +2927,6 @@ undo: > return ret; > } > > -static __read_mostly struct notifier_block perf_event_nmi_notifier = { > - .notifier_call = perf_event_nmi_handler, > - .next = NULL, > - .priority = 1 > -}; > - > static __initconst struct x86_pmu p6_pmu = { > .name = "p6", > .handle_irq = x86_pmu_handle_irq, > @@ -2793,7 +2953,7 @@ static __initconst struct x86_pmu p6_pmu > .event_bits = 32, > .event_mask = (1ULL << 32) - 1, > .get_event_constraints = intel_get_event_constraints, > - .event_constraints = intel_p6_event_constraints > + .event_constraints = intel_p6_event_constraints, > }; > > static __initconst struct x86_pmu core_pmu = { > @@ -2873,18 +3033,26 @@ static __init int p6_pmu_init(void) > case 7: > case 8: > case 11: /* Pentium III */ > + x86_pmu = p6_pmu; > + > + break; > case 9: > - case 13: > - /* Pentium M */ > + case 13: /* Pentium M */ > + x86_pmu = p6_pmu; > + > + x86_pmu.lbr_nr = 8; > + x86_pmu.lbr_tos = 0x01c9; > + x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR; > + x86_pmu.lbr_from = 0x40; > + > break; > + > default: > pr_cont("unsupported p6 CPU model %d ", > boot_cpu_data.x86_model); > return -ENODEV; > } > > - x86_pmu = p6_pmu; > - > return 0; > } > > @@ -2925,6 +3093,9 @@ static __init int intel_pmu_init(void) > x86_pmu.event_bits = eax.split.bit_width; > x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; > > + rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); > + x86_pmu.lbr_format = capabilities & 0x1f; > + > /* > * Quirk: v2 perfmon does not report fixed-purpose events, so > * assume at least 3 events: > @@ -2973,6 +3144,10 @@ no_datastore: > */ > switch (boot_cpu_data.x86_model) { > case 14: /* 65 nm core solo/duo, "Yonah" */ > + x86_pmu.lbr_nr = 8; > + x86_pmu.lbr_tos = 0x01c9; > + x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR; > + x86_pmu.lbr_from = 0x40; > pr_cont("Core events, "); > break; > > @@ -2980,6 +3155,13 @@ no_datastore: > case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ > case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ > case 29: /* six-core 45 nm xeon "Dunnington" */ > + x86_pmu.lbr_nr = 4; > + x86_pmu.lbr_tos = 0x01c9; > + x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR | > + X86_DEBUGCTL_FREEZE_LBRS_ON_PMI; > + x86_pmu.lbr_from = 0x40; > + x86_pmu.lbr_to = 0x60; > + > memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, > sizeof(hw_cache_event_ids)); > > @@ -2989,13 +3171,28 @@ no_datastore: > > case 26: /* 45 nm nehalem, "Bloomfield" */ > case 30: /* 45 nm nehalem, "Lynnfield" */ > + x86_pmu.lbr_nr = 16; > + x86_pmu.lbr_tos = 0x01c9; > + x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR | > + X86_DEBUGCTL_FREEZE_LBRS_ON_PMI; > + x86_pmu.lbr_from = 0x680; > + x86_pmu.lbr_to = 0x6c0; > + > memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, > sizeof(hw_cache_event_ids)); > > x86_pmu.event_constraints = intel_nehalem_event_constraints; > pr_cont("Nehalem/Corei7 events, "); > break; > - case 28: > + > + case 28: /* 
Atom */ > + x86_pmu.lbr_nr = 8; > + x86_pmu.lbr_tos = 0x01c9; > + x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR | > + X86_DEBUGCTL_FREEZE_LBRS_ON_PMI; > + x86_pmu.lbr_from = 0x40; > + x86_pmu.lbr_to = 0x60; > + > memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, > sizeof(hw_cache_event_ids)); > > @@ -3005,12 +3202,20 @@ no_datastore: > > case 37: /* 32 nm nehalem, "Clarkdale" */ > case 44: /* 32 nm nehalem, "Gulftown" */ > + x86_pmu.lbr_nr = 16; > + x86_pmu.lbr_tos = 0x01c9; > + x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR | > + X86_DEBUGCTL_FREEZE_LBRS_ON_PMI; > + x86_pmu.lbr_from = 0x680; > + x86_pmu.lbr_to = 0x6c0; > + > memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, > sizeof(hw_cache_event_ids)); > > x86_pmu.event_constraints = intel_westmere_event_constraints; > pr_cont("Westmere events, "); > break; > + > default: > /* > * default constraints for v2 and up > Index: linux-2.6/arch/x86/include/asm/perf_event.h > =================================================================== > --- linux-2.6.orig/arch/x86/include/asm/perf_event.h > +++ linux-2.6/arch/x86/include/asm/perf_event.h > @@ -1,6 +1,8 @@ > #ifndef _ASM_X86_PERF_EVENT_H > #define _ASM_X86_PERF_EVENT_H > > +#include <asm/msr.h> > + > /* > * Performance event hw details: > */ > @@ -122,11 +124,31 @@ union cpuid10_edx { > extern void init_hw_perf_events(void); > extern void perf_events_lapic_init(void); > > +#define X86_DEBUGCTL_LBR (1 << 0) > +#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI (1 << 11) > + > +static __always_inline void perf_nmi_enter(void) > +{ > + u64 debugctl; > + > + /* > + * Unconditionally disable LBR so as to minimally pollute the LBR > stack. > + * XXX: paravirt will screw us over massive > + */ > + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); > + debugctl &= ~X86_DEBUGCTL_LBR; > + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); > +} > + > +extern void perf_nmi_exit(void); > + > #define PERF_EVENT_INDEX_OFFSET 0 > > #else > static inline void init_hw_perf_events(void) { } > -static inline void perf_events_lapic_init(void) { } > +static inline void perf_events_lapic_init(void) { } > +static inline void perf_nmi_enter(void) { } > +static inline void perf_nmi_exit(void) { } > #endif > > #endif /* _ASM_X86_PERF_EVENT_H */ > Index: linux-2.6/arch/x86/kernel/traps.c > =================================================================== > --- linux-2.6.orig/arch/x86/kernel/traps.c > +++ linux-2.6/arch/x86/kernel/traps.c > @@ -45,6 +45,7 @@ > #endif > > #include <asm/kmemcheck.h> > +#include <asm/perf_event.h> > #include <asm/stacktrace.h> > #include <asm/processor.h> > #include <asm/debugreg.h> > @@ -442,6 +443,7 @@ static notrace __kprobes void default_do > dotraplinkage notrace __kprobes void > do_nmi(struct pt_regs *regs, long error_code) > { > + perf_nmi_enter(); > nmi_enter(); > > inc_irq_stat(__nmi_count); > @@ -450,6 +452,7 @@ do_nmi(struct pt_regs *regs, long error_ > default_do_nmi(regs); > > nmi_exit(); > + perf_nmi_exit(); > } > > void stop_nmi(void) > Index: linux-2.6/include/linux/perf_event.h > =================================================================== > --- linux-2.6.orig/include/linux/perf_event.h > +++ linux-2.6/include/linux/perf_event.h > @@ -125,8 +125,9 @@ enum perf_event_sample_format { > PERF_SAMPLE_PERIOD = 1U << 8, > PERF_SAMPLE_STREAM_ID = 1U << 9, > PERF_SAMPLE_RAW = 1U << 10, > + PERF_SAMPLE_LBR = 1U << 11, > > - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ > + PERF_SAMPLE_MAX = 1U << 12, /* non-ABI */ > }; > > /* > @@ -396,6 +397,9 @@ enum perf_event_type { > * { u64 nr, > * u64 ips[nr]; } && 
PERF_SAMPLE_CALLCHAIN > * > + * { u64 nr; > + * struct lbr_format lbr[nr]; } && PERF_SAMPLE_LBR > + * > * # > * # The RAW record below is opaque data wrt the ABI > * # > @@ -483,6 +487,7 @@ struct hw_perf_event { > int idx; > int last_cpu; > int pebs; > + u64 lbr_tos; > }; > struct { /* software */ > s64 remaining;
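Coming back to the missing sampling part: below is a rough sketch (again
not part of the patch; the function name is invented here) of what the
dump of the LBR stack into the sampling buffer could look like, reusing
intel_pmu_read_lbr() and the cpuc->lbr_stack[]/lbr_entries fields from the
patch plus the existing perf_output_put()/perf_output_copy() helpers:

static void intel_pmu_output_lbr(struct perf_output_handle *handle,
				 struct cpu_hw_events *cpuc,
				 struct perf_event *event)
{
	u64 nr;

	/* snapshot the hardware LBR stack into cpuc->lbr_stack[] */
	intel_pmu_read_lbr(cpuc, event);

	nr = cpuc->lbr_entries;

	/* { u64 nr; struct lbr_entry lbr[nr]; } layout for PERF_SAMPLE_LBR */
	perf_output_put(handle, nr);
	perf_output_copy(handle, cpuc->lbr_stack,
			 nr * sizeof(cpuc->lbr_stack[0]));
}

Two caveats: intel_pmu_read_lbr() as written fills the array starting from
the TOS, i.e., most recent branch first, so either the read loop or the
copy would have to be reversed to get true sequential (oldest-first)
order; and these extra bytes have to be accounted for when
perf_prepare_sample() computes header.size.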