On Fri, May 22, 2015 at 03:04:45AM -0700, Stephane Eranian wrote:
> > +       if (is_ht_workaround_enabled() &&
> > +           sched->state.nr_gp_counters++ >= x86_pmu.num_counters / 2)
> > +               return false;
> > +
> 
> Has to be > and not >= otherwise:

but it's a post-increment, so the check will test: 0, 1, 2, ... With >
we'd only match after 3 GP events.
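
To make the off-by-one concrete, a standalone sketch (not kernel code;
assumes a 4-counter PMU, so the limit is num_counters / 2 = 2):

#include <stdio.h>

/* How many events does "nr++ CMP limit" admit before the check trips? */
static int admitted(int limit, int strict)
{
	int nr = 0, n = 0;

	for (;;) {
		int old = nr++;	/* post-increment: the check tests the old value */

		if (strict ? old > limit : old >= limit)
			return n;
		n++;
	}
}

int main(void)
{
	printf("nr++ >= 2 admits %d events\n", admitted(2, 0)); /* 2 */
	printf("nr++ >  2 admits %d events\n", admitted(2, 1)); /* 3 */
	return 0;
}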

I'll agree it's not working right, though.

FWIW, I currently have the below, which also isn't working right.

It should not enforce the limit when there are no exclusive events being
scheduled.

It also doesn't break uncore scheduling: the uncore path now passes n as
gpmax, so with n events the new check can never trip there.
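
The limit is keyed off the has_exclusive[]/exclusive_present union below:
each sibling flips its own u16 when its first exclusive event arrives or
its last one goes away, and the scheduler peeks at the u32 overlay to see
whether either sibling has any. Roughly (standalone userspace sketch,
assuming u16/u32 are unsigned short/int):

#include <assert.h>

union excl_flags {
	unsigned short	has_exclusive[2];	/* one flag per HT sibling */
	unsigned int	exclusive_present;	/* both flags at once */
};

int main(void)
{
	union excl_flags f = { .has_exclusive = { 0, 0 } };

	assert(f.exclusive_present == 0);	/* no exclusive events anywhere */

	f.has_exclusive[1] = 1;			/* sibling 1 gains an exclusive event */
	assert(f.exclusive_present != 0);	/* both siblings see it */

	f.has_exclusive[1] = 0;			/* last exclusive event goes away */
	assert(f.exclusive_present == 0);	/* limit no longer enforced */

	return 0;
}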

---
 arch/x86/kernel/cpu/perf_event.c              |   31 ++++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event.h              |   10 +++++---
 arch/x86/kernel/cpu/perf_event_intel.c        |   28 ++++++-----------------
 arch/x86/kernel/cpu/perf_event_intel_uncore.c |    2 +-
 4 files changed, 43 insertions(+), 28 deletions(-)

--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -611,6 +611,7 @@ struct sched_state {
        int     event;          /* event index */
        int     counter;        /* counter index */
        int     unassigned;     /* number of events to be assigned left */
+       int     nr_gp;          /* number of GP counters used */
        unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 };
 
@@ -620,6 +621,7 @@ struct sched_state {
 struct perf_sched {
        int                     max_weight;
        int                     max_events;
+       int                     max_gp;
        struct event_constraint **constraints;
        struct sched_state      state;
        int                     saved_states;
@@ -630,13 +632,14 @@ struct perf_sched {
  * Initialize interator that runs through all events and counters.
  */
 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
-                           int num, int wmin, int wmax)
+                           int num, int wmin, int wmax, int gpmax)
 {
        int idx;
 
        memset(sched, 0, sizeof(*sched));
        sched->max_events       = num;
        sched->max_weight       = wmax;
+       sched->max_gp           = gpmax;
        sched->constraints      = constraints;
 
        for (idx = 0; idx < num; idx++) {
@@ -696,6 +699,10 @@ static bool __perf_sched_find_counter(st
                                goto done;
                }
        }
+
+       if (sched->state.nr_gp++ >= sched->max_gp)
+               return false;
+
        /* Grab the first unused counter starting with idx */
        idx = sched->state.counter;
        for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
@@ -757,11 +764,11 @@ static bool perf_sched_next_event(struct
  * Assign a counter for each event.
  */
 int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int *assign)
+                       int wmin, int wmax, int gpmax, int *assign)
 {
        struct perf_sched sched;
 
-       perf_sched_init(&sched, constraints, n, wmin, wmax);
+       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
 
        do {
                if (!perf_sched_find_counter(&sched))
@@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
 
        /* slow path */
        if (i != n) {
+               int gpmax = x86_pmu.num_counters;
+
+               /*
+                * Do not allow scheduling of more than half the available
+                * generic counters.
+                *
+                * This helps avoid counter starvation of the sibling
+                * thread by ensuring at most half the counters can be in
+                * exclusive mode. There are no designated counters for the
+                * limit; any N/2 counters can be used. This helps with
+                * events with specific counter constraints.
+                */
+               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+                       gpmax /= 2;
+
                unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
-                                            wmax, assign);
+                                            wmax, gpmax, assign);
        }
 
        /*
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -134,8 +134,6 @@ enum intel_excl_state_type {
 struct intel_excl_states {
        enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
        enum intel_excl_state_type state[X86_PMC_IDX_MAX];
-       int  num_alloc_cntrs;/* #counters allocated */
-       int  max_alloc_cntrs;/* max #counters allowed */
        bool sched_started; /* true if scheduling has started */
 };
 
@@ -144,6 +142,11 @@ struct intel_excl_cntrs {
 
        struct intel_excl_states states[2];
 
+       union {
+               u16     has_exclusive[2];
+               u32     exclusive_present;
+       };
+
        int             refcnt;         /* per-core: #HT threads */
        unsigned        core_id;        /* per-core: core id */
 };
@@ -176,6 +179,7 @@ struct cpu_hw_events {
        struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
        struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
 
+       int                     n_excl; /* the number of exclusive events */
 
        unsigned int            group_flag;
        int                     is_fake;
@@ -719,7 +723,7 @@ static inline void __x86_pmu_enable_even
 void x86_pmu_enable_all(int added);
 
 int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int *assign);
+                       int wmin, int wmax, int gpmax, int *assign);
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
 
 void x86_pmu_stop(struct perf_event *event, int flags);
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
        xl = &excl_cntrs->states[tid];
 
        xl->sched_started = true;
-       xl->num_alloc_cntrs = 0;
        /*
         * lock shared state until we are done scheduling
         * in stop_event_scheduling()
@@ -2000,6 +1999,10 @@ intel_get_excl_constraints(struct cpu_hw
         * across HT threads
         */
        is_excl = c->flags & PERF_X86_EVENT_EXCL;
+       if (is_excl) {
+               if (!cpuc->n_excl++)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+       }
 
        /*
         * xl = state of current HT
@@ -2008,18 +2011,6 @@ intel_get_excl_constraints(struct cpu_hw
        xl = &excl_cntrs->states[tid];
        xlo = &excl_cntrs->states[o_tid];
 
-       /*
-        * do not allow scheduling of more than max_alloc_cntrs
-        * which is set to half the available generic counters.
-        * this helps avoid counter starvation of sibling thread
-        * by ensuring at most half the counters cannot be in
-        * exclusive mode. There is not designated counters for the
-        * limits. Any N/2 counters can be used. This helps with
-        * events with specifix counter constraints
-        */
-       if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
-               return &emptyconstraint;
-
        cx = c;
 
        /*
@@ -2150,6 +2141,10 @@ static void intel_put_excl_constraints(s
 
        xl = &excl_cntrs->states[tid];
        xlo = &excl_cntrs->states[o_tid];
+       if (hwc->flags & PERF_X86_EVENT_EXCL) {
+               if (!--cpuc->n_excl)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+       }
 
        /*
         * put_constraint may be called from x86_schedule_events()
@@ -2632,8 +2627,6 @@ static void intel_pmu_cpu_starting(int c
                cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 
        if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               int h = x86_pmu.num_counters >> 1;
-
                for_each_cpu(i, topology_thread_cpumask(cpu)) {
                        struct intel_excl_cntrs *c;
 
@@ -2647,11 +2640,6 @@ static void intel_pmu_cpu_starting(int c
                }
                cpuc->excl_cntrs->core_id = core_id;
                cpuc->excl_cntrs->refcnt++;
-               /*
-                * set hard limit to half the number of generic counters
-                */
-               cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
-               cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
        }
 }
 
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -394,7 +394,7 @@ static int uncore_assign_events(struct i
        /* slow path */
        if (i != n)
                ret = perf_assign_events(box->event_constraint, n,
-                                        wmin, wmax, assign);
+                                        wmin, wmax, n, assign);
 
        if (!assign || ret) {
                for (i = 0; i < n; i++)