Re: [RFC][PATCH] perf: Rewrite core context handling

Song Liu Thu, 11 Oct 2018 00:53:01 -0700

Hi Peter, 

I am trying to understand this. Pardon me if any question is silly.


I am not sure I fully understand the motivation here. I guess we
see a problem when there are two (or more) independent hardware PMUs
per cpu? Then, on a given cpu, there are two (or more)
perf_cpu_context structures, but only one task context?

If this is correct (which I really doubt...), I guess perf_rotate_context()
is the problem? And if that is also correct, this patch may not help,
as we are doing rotation for each perf_cpu_pmu_context? (Or is rotation
per perf_event_context the next step?)

Or step back a little... I see two big changes:

1. struct perf_cpu_context is now per cpu (instead of per pmu per cpu);
2. one perf_event_ctxp per task_struct (instead of 2).  

I think #1 is a bigger change than #2. Is this correct? 


Of course, I could be totally lost. I will continue reading the code 
tomorrow. 

Could you please help me understand it better? 

Thanks,
Song

> On Oct 10, 2018, at 3:45 AM, Peter Zijlstra <[email protected]> wrote:
> 
> Hi all,
> 
> There have been various issues and limitations with the way perf uses
> (task) contexts to track events. Most notable is the single hardware PMU
> task context, which has resulted in a number of yucky things (both
> proposed and merged).
> Notably:
> 
> - HW breakpoint PMU
> - ARM big.little PMU
> - Intel Branch Monitoring PMU
> 
> Since we now track the events in RB trees, we can 'simply' add a pmu
> order to them and have them grouped that way, reducing to a single
> context. Of course, reality never quite works out that simple, and below
> ends up adding an intermediate data structure to bridge the context ->
> pmu mapping.
> 
> Something a little like:
> 
>              ,------------------------[1:n]---------------------.
>              V                                                  V
>    perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
>              ^                      ^     |                     |
>              `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
> 
> This patch builds (provided you disable CGROUP_PERF), boots and survives
> perf-top without the machine catching fire.
> 
> There's still a fair bit of loose ends (look for XXX), but I think this
> is the direction we should be going.
> 
> Comments?
> 
> Not-Quite-Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/powerpc/perf/core-book3s.c |    4 
> arch/x86/events/core.c          |    4 
> arch/x86/events/intel/core.c    |    6 
> arch/x86/events/intel/ds.c      |    6 
> arch/x86/events/intel/lbr.c     |   16 
> arch/x86/events/perf_event.h    |    6 
> include/linux/perf_event.h      |   80 +-
> include/linux/sched.h           |    2 
> kernel/events/core.c            | 1412 ++++++++++++++++++++--------------------
> 9 files changed, 815 insertions(+), 721 deletions(-)
> 
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -125,7 +125,7 @@ static unsigned long ebb_switch_in(bool
> 
> static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
> static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
> -static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
> +static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
> static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
> static void pmao_restore_workaround(bool ebb) { }
> #endif /* CONFIG_PPC32 */
> @@ -395,7 +395,7 @@ static void power_pmu_bhrb_disable(struc
> /* Called from ctxsw to prevent one process's branch entries to
>  * mingle with the other process's entries during context switch.
>  */
> -static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
> +static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> {
>       if (!ppmu->bhrb_nr)
>               return;
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2286,10 +2286,10 @@ static const struct attribute_group *x86
>       NULL,
> };
> 
> -static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
> +static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> {
>       if (x86_pmu.sched_task)
> -             x86_pmu.sched_task(ctx, sched_in);
> +             x86_pmu.sched_task(pmu_ctx, sched_in);
> }
> 
> void perf_check_microcode(void)
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3537,11 +3537,11 @@ static void intel_pmu_cpu_dying(int cpu)
>               disable_counter_freeze();
> }
> 
> -static void intel_pmu_sched_task(struct perf_event_context *ctx,
> +static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
>                                bool sched_in)
> {
> -     intel_pmu_pebs_sched_task(ctx, sched_in);
> -     intel_pmu_lbr_sched_task(ctx, sched_in);
> +     intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
> +     intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
> }
> 
> PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -885,7 +885,7 @@ static inline bool pebs_needs_sched_cb(s
>       return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
> }
> 
> -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
> +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> {
>       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> 
> @@ -947,7 +947,7 @@ void intel_pmu_pebs_add(struct perf_even
>       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
>               cpuc->n_large_pebs++;
> 
> -     pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
> +     pebs_update_state(needed_cb, cpuc, event->pmu);
> }
> 
> void intel_pmu_pebs_enable(struct perf_event *event)
> @@ -991,7 +991,7 @@ void intel_pmu_pebs_del(struct perf_even
>       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
>               cpuc->n_large_pebs--;
> 
> -     pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
> +     pebs_update_state(needed_cb, cpuc, event->pmu);
> }
> 
> void intel_pmu_pebs_disable(struct perf_event *event)
> --- a/arch/x86/events/intel/lbr.c
> +++ b/arch/x86/events/intel/lbr.c
> @@ -417,7 +417,7 @@ static void __intel_pmu_lbr_save(struct
>       cpuc->last_log_id = ++task_ctx->log_id;
> }
> 
> -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
> +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> {
>       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>       struct x86_perf_task_context *task_ctx;
> @@ -430,7 +430,7 @@ void intel_pmu_lbr_sched_task(struct per
>        * the task was scheduled out, restore the stack. Otherwise flush
>        * the LBR stack.
>        */
> -     task_ctx = ctx ? ctx->task_ctx_data : NULL;
> +     task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
>       if (task_ctx) {
>               if (sched_in)
>                       __intel_pmu_lbr_restore(task_ctx);
> @@ -464,8 +464,8 @@ void intel_pmu_lbr_add(struct perf_event
> 
>       cpuc->br_sel = event->hw.branch_reg.reg;
> 
> -     if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
> -             task_ctx = event->ctx->task_ctx_data;
> +     if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data) {
> +             task_ctx = event->pmu_ctx->task_ctx_data;
>               task_ctx->lbr_callstack_users++;
>       }
> 
> @@ -488,7 +488,7 @@ void intel_pmu_lbr_add(struct perf_event
>        * be 'new'. Conversely, a new event can get installed through the
>        * context switch path for the first time.
>        */
> -     perf_sched_cb_inc(event->ctx->pmu);
> +     perf_sched_cb_inc(event->pmu);
>       if (!cpuc->lbr_users++ && !event->total_time_running)
>               intel_pmu_lbr_reset();
> }
> @@ -502,14 +502,14 @@ void intel_pmu_lbr_del(struct perf_event
>               return;
> 
>       if (branch_user_callstack(cpuc->br_sel) &&
> -         event->ctx->task_ctx_data) {
> -             task_ctx = event->ctx->task_ctx_data;
> +         event->pmu_ctx->task_ctx_data) {
> +             task_ctx = event->pmu_ctx->task_ctx_data;
>               task_ctx->lbr_callstack_users--;
>       }
> 
>       cpuc->lbr_users--;
>       WARN_ON_ONCE(cpuc->lbr_users < 0);
> -     perf_sched_cb_dec(event->ctx->pmu);
> +     perf_sched_cb_dec(event->pmu);
> }
> 
> void intel_pmu_lbr_enable_all(bool pmi)
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -589,7 +589,7 @@ struct x86_pmu {
>       void            (*cpu_dead)(int cpu);
> 
>       void            (*check_microcode)(void);
> -     void            (*sched_task)(struct perf_event_context *ctx,
> +     void            (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
>                                     bool sched_in);
> 
>       /*
> @@ -930,13 +930,13 @@ void intel_pmu_pebs_enable_all(void);
> 
> void intel_pmu_pebs_disable_all(void);
> 
> -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
> +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
> 
> void intel_pmu_auto_reload_read(struct perf_event *event);
> 
> void intel_ds_init(void);
> 
> -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
> +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
> 
> u64 lbr_from_signext_quirk_wr(u64 val);
> 
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -227,6 +227,7 @@ struct hw_perf_event {
> };
> 
> struct perf_event;
> +struct perf_event_pmu_context;
> 
> /*
>  * Common implementation detail of pmu::{start,commit,cancel}_txn
> @@ -263,7 +264,9 @@ struct pmu {
>       int                             capabilities;
> 
>       int * __percpu                  pmu_disable_count;
> -     struct perf_cpu_context * __percpu pmu_cpu_context;
> +     struct perf_cpu_pmu_context * __percpu cpu_pmu_context;
> +
> +
>       atomic_t                        exclusive_cnt; /* < 0: cpu; > 0: tsk */
>       int                             task_ctx_nr;
>       int                             hrtimer_interval_ms;
> @@ -398,7 +401,7 @@ struct pmu {
>       /*
>        * context-switches callback
>        */
> -     void (*sched_task)              (struct perf_event_context *ctx,
> +     void (*sched_task)              (struct perf_event_pmu_context *ctx,
>                                       bool sched_in);
>       /*
>        * PMU specific data size
> @@ -619,6 +622,7 @@ struct perf_event {
>       struct hw_perf_event            hw;
> 
>       struct perf_event_context       *ctx;
> +     struct perf_event_pmu_context   *pmu_ctx;
>       atomic_long_t                   refcount;
> 
>       /*
> @@ -698,6 +702,41 @@ struct perf_event {
> #endif /* CONFIG_PERF_EVENTS */
> };
> 
> +/*
> + *           ,------------------------[1:n]---------------------.
> + *           V                                                  V
> + * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
> + *           ^                      ^     |                     |
> + *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
> + *
> + *
> + * XXX destroy epc when empty
> + *   refcount, !rcu
> + *
> + * XXX epc locking
> + *
> + *   event->pmu_ctx          ctx->mutex && inactive
> + *   ctx->pmu_ctx_list               ctx->mutex && ctx->lock
> + *
> + */
> +struct perf_event_pmu_context {
> +     struct pmu                      *pmu;
> +     struct perf_event_context       *ctx;
> +
> +     struct list_head                pmu_ctx_entry;
> +
> +     struct list_head                pinned_active;
> +     struct list_head                flexible_active;
> +
> +     unsigned int                    embedded : 1;
> +
> +     unsigned int                    nr_events;
> +     unsigned int                    nr_active;
> +
> +     atomic_t                        refcount; /* event <-> epc */
> +
> +     void                            *task_ctx_data; /* pmu specific data */
> +};
> 
> struct perf_event_groups {
>       struct rb_root  tree;
> @@ -710,7 +749,6 @@ struct perf_event_groups {
>  * Used as a container for task events and CPU events as well:
>  */
> struct perf_event_context {
> -     struct pmu                      *pmu;
>       /*
>        * Protect the states of the events in the list,
>        * nr_active, and the list:
> @@ -723,20 +761,21 @@ struct perf_event_context {
>        */
>       struct mutex                    mutex;
> 
> -     struct list_head                active_ctx_list;
> +     struct list_head                pmu_ctx_list;
> +
>       struct perf_event_groups        pinned_groups;
>       struct perf_event_groups        flexible_groups;
>       struct list_head                event_list;
> 
> -     struct list_head                pinned_active;
> -     struct list_head                flexible_active;
> -
>       int                             nr_events;
>       int                             nr_active;
>       int                             is_active;
> +
> +     int                             nr_task_data;
>       int                             nr_stat;
>       int                             nr_freq;
>       int                             rotate_disable;
> +
>       atomic_t                        refcount;
>       struct task_struct              *task;
> 
> @@ -757,7 +796,6 @@ struct perf_event_context {
> #ifdef CONFIG_CGROUP_PERF
>       int                             nr_cgroups;      /* cgroup evts */
> #endif
> -     void                            *task_ctx_data; /* pmu specific data */
>       struct rcu_head                 rcu_head;
> };
> 
> @@ -767,12 +805,13 @@ struct perf_event_context {
>  */
> #define PERF_NR_CONTEXTS      4
> 
> -/**
> - * struct perf_event_cpu_context - per cpu event context structure
> - */
> -struct perf_cpu_context {
> -     struct perf_event_context       ctx;
> -     struct perf_event_context       *task_ctx;
> +struct perf_cpu_pmu_context {
> +     struct perf_event_pmu_context   epc;
> +     struct perf_event_pmu_context   *task_epc;
> +
> +     struct list_head                sched_cb_entry;
> +     int                             sched_cb_usage;
> +
>       int                             active_oncpu;
>       int                             exclusive;
> 
> @@ -780,15 +819,20 @@ struct perf_cpu_context {
>       struct hrtimer                  hrtimer;
>       ktime_t                         hrtimer_interval;
>       unsigned int                    hrtimer_active;
> +};
> +
> +/**
> + * struct perf_event_cpu_context - per cpu event context structure
> + */
> +struct perf_cpu_context {
> +     struct perf_event_context       ctx;
> +     struct perf_event_context       *task_ctx;
> 
> #ifdef CONFIG_CGROUP_PERF
>       struct perf_cgroup              *cgrp;
>       struct list_head                cgrp_cpuctx_entry;
> #endif
> 
> -     struct list_head                sched_cb_entry;
> -     int                             sched_cb_usage;
> -
>       int                             online;
> };
> 
> @@ -1022,7 +1066,7 @@ static inline int is_software_event(stru
>  */
> static inline int in_software_context(struct perf_event *event)
> {
> -     return event->ctx->pmu->task_ctx_nr == perf_sw_context;
> +     return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
> }
> 
> extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1000,7 +1000,7 @@ struct task_struct {
>       struct futex_pi_state           *pi_state_cache;
> #endif
> #ifdef CONFIG_PERF_EVENTS
> -     struct perf_event_context       *perf_event_ctxp[perf_nr_task_contexts];
> +     struct perf_event_context       *perf_event_ctxp;
>       struct mutex                    perf_event_mutex;
>       struct list_head                perf_event_list;
> #endif
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -143,12 +143,6 @@ static int cpu_function_call(int cpu, re
>       return data.ret;
> }
> 
> -static inline struct perf_cpu_context *
> -__get_cpu_context(struct perf_event_context *ctx)
> -{
> -     return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
> -}
> -
> static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
>                         struct perf_event_context *ctx)
> {
> @@ -172,6 +166,8 @@ static bool is_kernel_event(struct perf_
>       return READ_ONCE(event->owner) == TASK_TOMBSTONE;
> }
> 
> +static DEFINE_PER_CPU(struct perf_cpu_context, cpu_context);
> +
> /*
>  * On task ctx scheduling...
>  *
> @@ -205,7 +201,7 @@ static int event_function(void *info)
>       struct event_function_struct *efs = info;
>       struct perf_event *event = efs->event;
>       struct perf_event_context *ctx = event->ctx;
> -     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
>       struct perf_event_context *task_ctx = cpuctx->task_ctx;
>       int ret = 0;
> 
> @@ -302,7 +298,7 @@ static void event_function_call(struct p
> static void event_function_local(struct perf_event *event, event_f func, void *data)
> {
>       struct perf_event_context *ctx = event->ctx;
> -     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
>       struct task_struct *task = READ_ONCE(ctx->task);
>       struct perf_event_context *task_ctx = NULL;
> 
> @@ -376,7 +372,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
> static atomic_t perf_sched_count;
> 
> static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
> -static DEFINE_PER_CPU(int, perf_sched_cb_usages);
> static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
> 
> static atomic_t nr_mmap_events __read_mostly;
> @@ -430,7 +425,7 @@ static void update_perf_cpu_limits(void)
>       WRITE_ONCE(perf_sample_allowed_ns, tmp);
> }
> 
> -static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
> +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
> 
> int perf_proc_update_handler(struct ctl_table *table, int write,
>               void __user *buffer, size_t *lenp,
> @@ -555,13 +550,6 @@ void perf_sample_event_took(u64 sample_l
> 
> static atomic64_t perf_event_id;
> 
> -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> -                           enum event_type_t event_type);
> -
> -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> -                          enum event_type_t event_type,
> -                          struct task_struct *task);
> -
> static void update_context_time(struct perf_event_context *ctx);
> static u64 perf_event_time(struct perf_event *event);
> 
> @@ -810,7 +798,7 @@ static void perf_cgroup_switch(struct ta
>               perf_pmu_disable(cpuctx->ctx.pmu);
> 
>               if (mode & PERF_CGROUP_SWOUT) {
> -                     cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> +                     ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
>                       /*
>                        * must not be done before ctxswout due
>                        * to event_filter_match() in event_sched_out()
> @@ -827,9 +815,8 @@ static void perf_cgroup_switch(struct ta
>                        * we pass the cpuctx->ctx to perf_cgroup_from_task()
>                        * because cgorup events are only per-cpu
>                        */
> -                     cpuctx->cgrp = perf_cgroup_from_task(task,
> -                                                          &cpuctx->ctx);
> -                     cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> +                     cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
> +                     ctx_sched_in(&cpuctx->ctx, EVENT_ALL, task);
>               }
>               perf_pmu_enable(cpuctx->ctx.pmu);
>               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -1063,34 +1050,30 @@ list_update_cgroup_event(struct perf_eve
>  */
> static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
> {
> -     struct perf_cpu_context *cpuctx;
> +     struct perf_cpu_pmu_context *cpc;
>       bool rotations;
> 
>       lockdep_assert_irqs_disabled();
> 
> -     cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
> -     rotations = perf_rotate_context(cpuctx);
> +     cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
> +     rotations = perf_rotate_context(cpc);
> 
> -     raw_spin_lock(&cpuctx->hrtimer_lock);
> +     raw_spin_lock(&cpc->hrtimer_lock);
>       if (rotations)
> -             hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
> +             hrtimer_forward_now(hr, cpc->hrtimer_interval);
>       else
> -             cpuctx->hrtimer_active = 0;
> -     raw_spin_unlock(&cpuctx->hrtimer_lock);
> +             cpc->hrtimer_active = 0;
> +     raw_spin_unlock(&cpc->hrtimer_lock);
> 
>       return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
> }
> 
> -static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
> +static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
> {
> -     struct hrtimer *timer = &cpuctx->hrtimer;
> -     struct pmu *pmu = cpuctx->ctx.pmu;
> +     struct hrtimer *timer = &cpc->hrtimer;
> +     struct pmu *pmu = cpc->epc.pmu;
>       u64 interval;
> 
> -     /* no multiplexing needed for SW PMU */
> -     if (pmu->task_ctx_nr == perf_sw_context)
> -             return;
> -
>       /*
>        * check default is sane, if not set then force to
>        * default interval (1/tick)
> @@ -1099,30 +1082,25 @@ static void __perf_mux_hrtimer_init(stru
>       if (interval < 1)
>               interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
> 
> -     cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
> +     cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
> 
> -     raw_spin_lock_init(&cpuctx->hrtimer_lock);
> +     raw_spin_lock_init(&cpc->hrtimer_lock);
>       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
>       timer->function = perf_mux_hrtimer_handler;
> }
> 
> -static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
> +static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
> {
> -     struct hrtimer *timer = &cpuctx->hrtimer;
> -     struct pmu *pmu = cpuctx->ctx.pmu;
> +     struct hrtimer *timer = &cpc->hrtimer;
>       unsigned long flags;
> 
> -     /* not for SW PMU */
> -     if (pmu->task_ctx_nr == perf_sw_context)
> -             return 0;
> -
> -     raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
> -     if (!cpuctx->hrtimer_active) {
> -             cpuctx->hrtimer_active = 1;
> -             hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
> +     raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
> +     if (!cpc->hrtimer_active) {
> +             cpc->hrtimer_active = 1;
> +             hrtimer_forward_now(timer, cpc->hrtimer_interval);
>               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
>       }
> -     raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
> +     raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
> 
>       return 0;
> }
> @@ -1141,32 +1119,25 @@ void perf_pmu_enable(struct pmu *pmu)
>               pmu->pmu_enable(pmu);
> }
> 
> -static DEFINE_PER_CPU(struct list_head, active_ctx_list);
> -
> -/*
> - * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
> - * perf_event_task_tick() are fully serialized because they're strictly cpu
> - * affine and perf_event_ctx{activate,deactivate} are called with IRQs
> - * disabled, while perf_event_task_tick is called from IRQ context.
> - */
> -static void perf_event_ctx_activate(struct perf_event_context *ctx)
> +void perf_assert_pmu_disabled(struct pmu *pmu)
> {
> -     struct list_head *head = this_cpu_ptr(&active_ctx_list);
> -
> -     lockdep_assert_irqs_disabled();
> +     WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
> +}
> 
> -     WARN_ON(!list_empty(&ctx->active_ctx_list));
> +void perf_ctx_disable(struct perf_event_context *ctx)
> +{
> +     struct perf_event_pmu_context *pmu_ctx;
> 
> -     list_add(&ctx->active_ctx_list, head);
> +     list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> +             perf_pmu_disable(pmu_ctx->pmu);
> }
> 
> -static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
> +void perf_ctx_enable(struct perf_event_context *ctx)
> {
> -     lockdep_assert_irqs_disabled();
> +     struct perf_event_pmu_context *pmu_ctx;
> 
> -     WARN_ON(list_empty(&ctx->active_ctx_list));
> -
> -     list_del_init(&ctx->active_ctx_list);
> +     list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> +             perf_pmu_enable(pmu_ctx->pmu);
> }
> 
> static void get_ctx(struct perf_event_context *ctx)
> @@ -1179,7 +1150,6 @@ static void free_ctx(struct rcu_head *he
>       struct perf_event_context *ctx;
> 
>       ctx = container_of(head, struct perf_event_context, rcu_head);
> -     kfree(ctx->task_ctx_data);
>       kfree(ctx);
> }
> 
> @@ -1363,7 +1333,7 @@ static u64 primary_event_id(struct perf_
>  * the context could get moved to another task.
>  */
> static struct perf_event_context *
> -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
> +perf_lock_task_context(struct task_struct *task, unsigned long *flags)
> {
>       struct perf_event_context *ctx;
> 
> @@ -1379,7 +1349,7 @@ perf_lock_task_context(struct task_struc
>        */
>       local_irq_save(*flags);
>       rcu_read_lock();
> -     ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
> +     ctx = rcu_dereference(task->perf_event_ctxp);
>       if (ctx) {
>               /*
>                * If this context is a clone of another, it might
> @@ -1392,7 +1362,7 @@ perf_lock_task_context(struct task_struc
>                * can't get swapped on us any more.
>                */
>               raw_spin_lock(&ctx->lock);
> -             if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
> +             if (ctx != rcu_dereference(task->perf_event_ctxp)) {
>                       raw_spin_unlock(&ctx->lock);
>                       rcu_read_unlock();
>                       local_irq_restore(*flags);
> @@ -1419,12 +1389,12 @@ perf_lock_task_context(struct task_struc
>  * reference count so that the context can't get freed.
>  */
> static struct perf_event_context *
> -perf_pin_task_context(struct task_struct *task, int ctxn)
> +perf_pin_task_context(struct task_struct *task)
> {
>       struct perf_event_context *ctx;
>       unsigned long flags;
> 
> -     ctx = perf_lock_task_context(task, ctxn, &flags);
> +     ctx = perf_lock_task_context(task, &flags);
>       if (ctx) {
>               ++ctx->pin_count;
>               raw_spin_unlock_irqrestore(&ctx->lock, flags);
> @@ -1528,6 +1498,11 @@ perf_event_groups_less(struct perf_event
>       if (left->cpu > right->cpu)
>               return false;
> 
> +     if (left->pmu_ctx->pmu < right->pmu_ctx->pmu)
> +             return true;
> +     if (left->pmu_ctx->pmu > right->pmu_ctx->pmu)
> +             return false;
> +
>       if (left->group_index < right->group_index)
>               return true;
>       if (left->group_index > right->group_index)
> @@ -1610,7 +1585,7 @@ del_event_from_groups(struct perf_event
>  * Get the leftmost event in the @cpu subtree.
>  */
> static struct perf_event *
> -perf_event_groups_first(struct perf_event_groups *groups, int cpu)
> +perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu)
> {
>       struct perf_event *node_event = NULL, *match = NULL;
>       struct rb_node *node = groups->tree.rb_node;
> @@ -1623,8 +1598,19 @@ perf_event_groups_first(struct perf_even
>               } else if (cpu > node_event->cpu) {
>                       node = node->rb_right;
>               } else {
> -                     match = node_event;
> -                     node = node->rb_left;
> +                     if (pmu) {
> +                             if (pmu < node_event->pmu_ctx->pmu) {
> +                                     node = node->rb_left;
> +                             } else if (pmu > node_event->pmu_ctx->pmu) {
> +                                     node = node->rb_right;
> +                             } else  {
> +                                     match = node_event;
> +                                     node = node->rb_left;
> +                             }
> +                     } else {
> +                             match = node_event;
> +                             node = node->rb_left;
> +                     }
>               }
>       }
> 
> @@ -1635,13 +1621,17 @@ perf_event_groups_first(struct perf_even
>  * Like rb_entry_next_safe() for the @cpu subtree.
>  */
> static struct perf_event *
> -perf_event_groups_next(struct perf_event *event)
> +perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
> {
>       struct perf_event *next;
> 
>       next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
> -     if (next && next->cpu == event->cpu)
> +     if (next && next->cpu == event->cpu) {
> +             if (pmu && next->pmu_ctx->pmu != pmu)
> +                     return NULL;
> +
>               return next;
> +     }
> 
>       return NULL;
> }
> @@ -1687,6 +1677,8 @@ list_add_event(struct perf_event *event,
>               ctx->nr_stat++;
> 
>       ctx->generation++;
> +
> +     event->pmu_ctx->nr_events++;
> }
> 
> /*
> @@ -1883,6 +1875,8 @@ list_del_event(struct perf_event *event,
>               perf_event_set_state(event, PERF_EVENT_STATE_OFF);
> 
>       ctx->generation++;
> +
> +     event->pmu_ctx->nr_events--;
> }
> 
> static void perf_group_detach(struct perf_event *event)
> @@ -1926,8 +1920,9 @@ static void perf_group_detach(struct per
>                       add_event_to_groups(sibling, event->ctx);
> 
>                       if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
> +                             struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
>                               struct list_head *list = sibling->attr.pinned ?
> -                                     &ctx->pinned_active : &ctx->flexible_active;
> +                                     &pmu_ctx->pinned_active : &pmu_ctx->flexible_active;
> 
>                               list_add_tail(&sibling->active_list, list);
>                       }
> @@ -1983,12 +1978,14 @@ event_filter_match(struct perf_event *ev
> }
> 
> static void
> -event_sched_out(struct perf_event *event,
> -               struct perf_cpu_context *cpuctx,
> -               struct perf_event_context *ctx)
> +event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> {
> +     struct perf_event_pmu_context *epc = event->pmu_ctx;
> +     struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
>       enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
> 
> +     // XXX cpc serialization, probably per-cpu IRQ disabled
> +
>       WARN_ON_ONCE(event->ctx != ctx);
>       lockdep_assert_held(&ctx->lock);
> 
> @@ -2014,41 +2011,35 @@ event_sched_out(struct perf_event *event
>       perf_event_set_state(event, state);
> 
>       if (!is_software_event(event))
> -             cpuctx->active_oncpu--;
> +             cpc->active_oncpu--;
>       if (!--ctx->nr_active)
> -             perf_event_ctx_deactivate(ctx);
> +             ;
> +     event->pmu_ctx->nr_active--;
>       if (event->attr.freq && event->attr.sample_freq)
>               ctx->nr_freq--;
> -     if (event->attr.exclusive || !cpuctx->active_oncpu)
> -             cpuctx->exclusive = 0;
> +     if (event->attr.exclusive || !cpc->active_oncpu)
> +             cpc->exclusive = 0;
> 
>       perf_pmu_enable(event->pmu);
> }
> 
> static void
> -group_sched_out(struct perf_event *group_event,
> -             struct perf_cpu_context *cpuctx,
> -             struct perf_event_context *ctx)
> +group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
> {
>       struct perf_event *event;
> 
>       if (group_event->state != PERF_EVENT_STATE_ACTIVE)
>               return;
> 
> -     perf_pmu_disable(ctx->pmu);
> +     perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
> 
> -     event_sched_out(group_event, cpuctx, ctx);
> +     event_sched_out(group_event, ctx);
> 
>       /*
>        * Schedule out siblings (if any):
>        */
>       for_each_sibling_event(event, group_event)
> -             event_sched_out(event, cpuctx, ctx);
> -
> -     perf_pmu_enable(ctx->pmu);
> -
> -     if (group_event->attr.exclusive)
> -             cpuctx->exclusive = 0;
> +             event_sched_out(event, ctx);
> }
> 
> #define DETACH_GROUP  0x01UL
> @@ -2072,7 +2063,7 @@ __perf_remove_from_context(struct perf_e
>               update_cgrp_time_from_cpuctx(cpuctx);
>       }
> 
> -     event_sched_out(event, cpuctx, ctx);
> +     event_sched_out(event, ctx);
>       if (flags & DETACH_GROUP)
>               perf_group_detach(event);
>       list_del_event(event, ctx);
> @@ -2139,12 +2130,16 @@ static void __perf_event_disable(struct
>               update_cgrp_time_from_event(event);
>       }
> 
> +     perf_pmu_disable(event->pmu_ctx->pmu);
> +
>       if (event == event->group_leader)
> -             group_sched_out(event, cpuctx, ctx);
> +             group_sched_out(event, ctx);
>       else
> -             event_sched_out(event, cpuctx, ctx);
> +             event_sched_out(event, ctx);
> 
>       perf_event_set_state(event, PERF_EVENT_STATE_OFF);
> +
> +     perf_pmu_enable(event->pmu_ctx->pmu);
> }
> 
> /*
> @@ -2240,10 +2235,10 @@ static void perf_log_throttle(struct per
> static void perf_log_itrace_start(struct perf_event *event);
> 
> static int
> -event_sched_in(struct perf_event *event,
> -              struct perf_cpu_context *cpuctx,
> -              struct perf_event_context *ctx)
> +event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
> {
> +     struct perf_event_pmu_context *epc = event->pmu_ctx;
> +     struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
>       int ret = 0;
> 
>       lockdep_assert_held(&ctx->lock);
> @@ -2284,14 +2279,15 @@ event_sched_in(struct perf_event *event,
>       }
> 
>       if (!is_software_event(event))
> -             cpuctx->active_oncpu++;
> +             cpc->active_oncpu++;
>       if (!ctx->nr_active++)
> -             perf_event_ctx_activate(ctx);
> +             ;
> +     event->pmu_ctx->nr_active++;
>       if (event->attr.freq && event->attr.sample_freq)
>               ctx->nr_freq++;
> 
>       if (event->attr.exclusive)
> -             cpuctx->exclusive = 1;
> +             cpc->exclusive = 1;
> 
> out:
>       perf_pmu_enable(event->pmu);
> @@ -2300,21 +2296,19 @@ event_sched_in(struct perf_event *event,
> }
> 
> static int
> -group_sched_in(struct perf_event *group_event,
> -            struct perf_cpu_context *cpuctx,
> -            struct perf_event_context *ctx)
> +group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
> {
>       struct perf_event *event, *partial_group = NULL;
> -     struct pmu *pmu = ctx->pmu;
> +     struct pmu *pmu = group_event->pmu_ctx->pmu;
> 
>       if (group_event->state == PERF_EVENT_STATE_OFF)
>               return 0;
> 
>       pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
> 
> -     if (event_sched_in(group_event, cpuctx, ctx)) {
> +     if (event_sched_in(group_event, ctx)) {
>               pmu->cancel_txn(pmu);
> -             perf_mux_hrtimer_restart(cpuctx);
> +             perf_mux_hrtimer_restart(this_cpu_ptr(pmu->cpu_pmu_context));
>               return -EAGAIN;
>       }
> 
> @@ -2322,7 +2316,7 @@ group_sched_in(struct perf_event *group_
>        * Schedule in siblings as one group (if any):
>        */
>       for_each_sibling_event(event, group_event) {
> -             if (event_sched_in(event, cpuctx, ctx)) {
> +             if (event_sched_in(event, ctx)) {
>                       partial_group = event;
>                       goto group_error;
>               }
> @@ -2341,13 +2335,13 @@ group_sched_in(struct perf_event *group_
>               if (event == partial_group)
>                       break;
> 
> -             event_sched_out(event, cpuctx, ctx);
> +             event_sched_out(event, ctx);
>       }
> -     event_sched_out(group_event, cpuctx, ctx);
> +     event_sched_out(group_event, ctx);
> 
>       pmu->cancel_txn(pmu);
> 
> -     perf_mux_hrtimer_restart(cpuctx);
> +     perf_mux_hrtimer_restart(this_cpu_ptr(pmu->cpu_pmu_context));
> 
>       return -EAGAIN;
> }
> @@ -2355,10 +2349,11 @@ group_sched_in(struct perf_event *group_
> /*
>  * Work out whether we can put this event group on the CPU now.
>  */
> -static int group_can_go_on(struct perf_event *event,
> -                        struct perf_cpu_context *cpuctx,
> -                        int can_add_hw)
> +static int group_can_go_on(struct perf_event *event, int can_add_hw)
> {
> +     struct perf_event_pmu_context *epc = event->pmu_ctx;
> +     struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
> +
>       /*
>        * Groups consisting entirely of software events can always go on.
>        */
> @@ -2368,13 +2363,13 @@ static int group_can_go_on(struct perf_e
>        * If an exclusive group is already on, no other hardware
>        * events can go on.
>        */
> -     if (cpuctx->exclusive)
> +     if (cpc->exclusive)
>               return 0;
>       /*
>        * If this group is exclusive and there are already
>        * events on the CPU, it can't go on.
>        */
> -     if (event->attr.exclusive && cpuctx->active_oncpu)
> +     if (event->attr.exclusive && cpc->active_oncpu)
>               return 0;
>       /*
>        * Otherwise, try to add it if all previous groups were able
> @@ -2391,37 +2386,36 @@ static void add_event_to_ctx(struct perf
> }
> 
> static void ctx_sched_out(struct perf_event_context *ctx,
> -                       struct perf_cpu_context *cpuctx,
>                         enum event_type_t event_type);
> static void
> ctx_sched_in(struct perf_event_context *ctx,
> -          struct perf_cpu_context *cpuctx,
>            enum event_type_t event_type,
>            struct task_struct *task);
> 
> -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
> -                            struct perf_event_context *ctx,
> +static void task_ctx_sched_out(struct perf_event_context *ctx,
>                              enum event_type_t event_type)
> {
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
> +
>       if (!cpuctx->task_ctx)
>               return;
> 
>       if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
>               return;
> 
> -     ctx_sched_out(ctx, cpuctx, event_type);
> +     ctx_sched_out(ctx, event_type);
> }
> 
> static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>                               struct perf_event_context *ctx,
>                               struct task_struct *task)
> {
> -     cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
> +     ctx_sched_in(&cpuctx->ctx, EVENT_PINNED, task);
>       if (ctx)
> -             ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
> -     cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
> +             ctx_sched_in(ctx, EVENT_PINNED, task);
> +     ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE, task);
>       if (ctx)
> -             ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
> +             ctx_sched_in(ctx, EVENT_FLEXIBLE, task);
> }
> 
> /*
> @@ -2438,12 +2432,12 @@ static void perf_event_sched_in(struct p
>  * This can be called after a batch operation on task events, in which case
>  * event_type is a bit mask of the types of events involved. For CPU events,
>  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
> + *
>  */
> static void ctx_resched(struct perf_cpu_context *cpuctx,
>                       struct perf_event_context *task_ctx,
>                       enum event_type_t event_type)
> {
> -     enum event_type_t ctx_event_type;
>       bool cpu_event = !!(event_type & EVENT_CPU);
> 
>       /*
> @@ -2453,11 +2447,13 @@ static void ctx_resched(struct perf_cpu_
>       if (event_type & EVENT_PINNED)
>               event_type |= EVENT_FLEXIBLE;
> 
> -     ctx_event_type = event_type & EVENT_ALL;
> +     event_type &= EVENT_ALL;
> 
> -     perf_pmu_disable(cpuctx->ctx.pmu);
> -     if (task_ctx)
> -             task_ctx_sched_out(cpuctx, task_ctx, event_type);
> +     perf_ctx_disable(&cpuctx->ctx);
> +     if (task_ctx) {
> +             perf_ctx_disable(task_ctx);
> +             task_ctx_sched_out(task_ctx, event_type);
> +     }
> 
>       /*
>        * Decide which cpu ctx groups to schedule out based on the types
> @@ -2467,12 +2463,15 @@ static void ctx_resched(struct perf_cpu_
>        *  - otherwise, do nothing more.
>        */
>       if (cpu_event)
> -             cpu_ctx_sched_out(cpuctx, ctx_event_type);
> -     else if (ctx_event_type & EVENT_PINNED)
> -             cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> +             ctx_sched_out(&cpuctx->ctx, event_type);
> +     else if (event_type & EVENT_PINNED)
> +             ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> 
>       perf_event_sched_in(cpuctx, task_ctx, current);
> -     perf_pmu_enable(cpuctx->ctx.pmu);
> +
> +     perf_ctx_enable(&cpuctx->ctx);
> +     if (task_ctx)
> +             perf_ctx_enable(task_ctx);
> }
> 
> /*
> @@ -2485,7 +2484,7 @@ static int  __perf_install_in_context(vo
> {
>       struct perf_event *event = info;
>       struct perf_event_context *ctx = event->ctx;
> -     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
>       struct perf_event_context *task_ctx = cpuctx->task_ctx;
>       bool reprogram = true;
>       int ret = 0;
> @@ -2527,7 +2526,7 @@ static int  __perf_install_in_context(vo
> #endif
> 
>       if (reprogram) {
> -             ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> +             ctx_sched_out(ctx, EVENT_TIME);
>               add_event_to_ctx(event, ctx);
>               ctx_resched(cpuctx, task_ctx, get_event_type(event));
>       } else {
> @@ -2648,7 +2647,7 @@ static void __perf_event_enable(struct p
>               return;
> 
>       if (ctx->is_active)
> -             ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> +             ctx_sched_out(ctx, EVENT_TIME);
> 
>       perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
> 
> @@ -2656,7 +2655,7 @@ static void __perf_event_enable(struct p
>               return;
> 
>       if (!event_filter_match(event)) {
> -             ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
> +             ctx_sched_in(ctx, EVENT_TIME, current);
>               return;
>       }
> 
> @@ -2665,7 +2664,7 @@ static void __perf_event_enable(struct p
>        * then don't put it on unless the group is on.
>        */
>       if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> -             ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
> +             ctx_sched_in(ctx, EVENT_TIME, current);
>               return;
>       }
> 
> @@ -2889,11 +2888,46 @@ static int perf_event_modify_attr(struct
>       }
> }
> 
> -static void ctx_sched_out(struct perf_event_context *ctx,
> -                       struct perf_cpu_context *cpuctx,
> -                       enum event_type_t event_type)
> +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> +                             enum event_type_t event_type)
> {
> +     struct perf_event_context *ctx = pmu_ctx->ctx;
>       struct perf_event *event, *tmp;
> +     struct pmu *pmu = pmu_ctx->pmu;
> +
> +     if (ctx->task && !ctx->is_active) {
> +             struct perf_cpu_pmu_context *cpc;
> +
> +             cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> +             WARN_ON_ONCE(cpc->task_epc != pmu_ctx);
> +             cpc->task_epc = NULL;
> +     }
> +
> +     if (!event_type)
> +             return;
> +
> +     perf_pmu_disable(pmu);
> +     if (event_type & EVENT_PINNED) {
> +             list_for_each_entry_safe(event, tmp,
> +                             &pmu_ctx->pinned_active,
> +                             active_list)
> +                     group_sched_out(event, ctx);
> +     }
> +
> +     if (event_type & EVENT_FLEXIBLE) {
> +             list_for_each_entry_safe(event, tmp,
> +                             &pmu_ctx->flexible_active,
> +                             active_list)
> +                     group_sched_out(event, ctx);
> +     }
> +     perf_pmu_enable(pmu);
> +}
> +
> +static void
> +ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
> +{
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
> +     struct perf_event_pmu_context *pmu_ctx;
>       int is_active = ctx->is_active;
> 
>       lockdep_assert_held(&ctx->lock);
> @@ -2936,20 +2970,8 @@ static void ctx_sched_out(struct perf_ev
> 
>       is_active ^= ctx->is_active; /* changed bits */
> 
> -     if (!ctx->nr_active || !(is_active & EVENT_ALL))
> -             return;
> -
> -     perf_pmu_disable(ctx->pmu);
> -     if (is_active & EVENT_PINNED) {
> -             list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
> -                     group_sched_out(event, cpuctx, ctx);
> -     }
> -
> -     if (is_active & EVENT_FLEXIBLE) {
> -             list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
> -                     group_sched_out(event, cpuctx, ctx);
> -     }
> -     perf_pmu_enable(ctx->pmu);
> +     list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> +             __pmu_ctx_sched_out(pmu_ctx, is_active);
> }
> 
> /*
> @@ -3054,10 +3076,34 @@ static void perf_event_sync_stat(struct
>       }
> }
> 
> -static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
> -                                      struct task_struct *next)
> +static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
> +                                       struct perf_event_context *next_ctx)
> +{
> +     struct perf_event_pmu_context *prev_epc, *next_epc;
> +
> +     if (!prev_ctx->nr_task_data)
> +             return;
> +
> +     prev_epc = list_first_entry(&prev_ctx->pmu_ctx_list,
> +                                 struct perf_event_pmu_context,
> +                                 pmu_ctx_entry);
> +     next_epc = list_first_entry(&next_ctx->pmu_ctx_list,
> +                                 struct perf_event_pmu_context,
> +                                 pmu_ctx_entry);
> +
> +     while (&prev_epc->pmu_ctx_entry != &prev_ctx->pmu_ctx_list &&
> +            &next_epc->pmu_ctx_entry != &next_ctx->pmu_ctx_list) {
> +
> +             WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu);
> +
> +             swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
> +     }
> +}
> +
> +static void
> +perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> {
> -     struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
> +     struct perf_event_context *ctx = task->perf_event_ctxp;
>       struct perf_event_context *next_ctx;
>       struct perf_event_context *parent, *next_parent;
>       struct perf_cpu_context *cpuctx;
> @@ -3066,12 +3112,12 @@ static void perf_event_context_sched_out
>       if (likely(!ctx))
>               return;
> 
> -     cpuctx = __get_cpu_context(ctx);
> +     cpuctx = this_cpu_ptr(&cpu_context);
>       if (!cpuctx->task_ctx)
>               return;
> 
>       rcu_read_lock();
> -     next_ctx = next->perf_event_ctxp[ctxn];
> +     next_ctx = rcu_dereference(next->perf_event_ctxp);
>       if (!next_ctx)
>               goto unlock;
> 
> @@ -3098,7 +3144,7 @@ static void perf_event_context_sched_out
>                       WRITE_ONCE(ctx->task, next);
>                       WRITE_ONCE(next_ctx->task, task);
> 
> -                     swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
> +                     perf_event_swap_task_ctx_data(ctx, next_ctx);
> 
>                       /*
>                        * RCU_INIT_POINTER here is safe because we've not
> @@ -3107,8 +3153,8 @@ static void perf_event_context_sched_out
>                        * since those values are always verified under
>                        * ctx->lock which we're now holding.
>                        */
> -                     RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
> -                     RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
> +                     RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
> +                     RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
> 
>                       do_switch = 0;
> 
> @@ -3122,31 +3168,34 @@ static void perf_event_context_sched_out
> 
>       if (do_switch) {
>               raw_spin_lock(&ctx->lock);
> -             task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
> +             task_ctx_sched_out(ctx, EVENT_ALL);
>               raw_spin_unlock(&ctx->lock);
>       }
> }
> 
> static DEFINE_PER_CPU(struct list_head, sched_cb_list);
> +static DEFINE_PER_CPU(int, perf_sched_cb_usages);
> 
> void perf_sched_cb_dec(struct pmu *pmu)
> {
> -     struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> +     struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> 
>       this_cpu_dec(perf_sched_cb_usages);
> +     barrier();
> 
> -     if (!--cpuctx->sched_cb_usage)
> -             list_del(&cpuctx->sched_cb_entry);
> +     if (!--cpc->sched_cb_usage)
> +             list_del(&cpc->sched_cb_entry);
> }
> 
> 
> void perf_sched_cb_inc(struct pmu *pmu)
> {
> -     struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> +     struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> 
> -     if (!cpuctx->sched_cb_usage++)
> -             list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> +     if (!cpc->sched_cb_usage++)
> +             list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
> 
> +     barrier();
>       this_cpu_inc(perf_sched_cb_usages);
> }
> 
> @@ -3162,22 +3211,24 @@ static void perf_pmu_sched_task(struct t
>                               struct task_struct *next,
>                               bool sched_in)
> {
> -     struct perf_cpu_context *cpuctx;
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
> +     struct perf_cpu_pmu_context *cpc;
>       struct pmu *pmu;
> 
>       if (prev == next)
>               return;
> 
> -     list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
> -             pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
> +     list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
> +             pmu = cpc->epc.pmu;
> 
> +             /* software PMUs will not have sched_task */
>               if (WARN_ON_ONCE(!pmu->sched_task))
>                       continue;
> 
>               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>               perf_pmu_disable(pmu);
> 
> -             pmu->sched_task(cpuctx->task_ctx, sched_in);
> +             pmu->sched_task(cpc->task_epc, sched_in);
> 
>               perf_pmu_enable(pmu);
>               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -3187,9 +3238,6 @@ static void perf_pmu_sched_task(struct t
> static void perf_event_switch(struct task_struct *task,
>                             struct task_struct *next_prev, bool sched_in);
> 
> -#define for_each_task_context_nr(ctxn)                                       \
> -     for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
> -
> /*
>  * Called from scheduler to remove the events of the current task,
>  * with interrupts disabled.
> @@ -3204,16 +3252,13 @@ static void perf_event_switch(struct tas
> void __perf_event_task_sched_out(struct task_struct *task,
>                                struct task_struct *next)
> {
> -     int ctxn;
> -
>       if (__this_cpu_read(perf_sched_cb_usages))
>               perf_pmu_sched_task(task, next, false);
> 
>       if (atomic_read(&nr_switch_events))
>               perf_event_switch(task, next, false);
> 
> -     for_each_task_context_nr(ctxn)
> -             perf_event_context_sched_out(task, ctxn, next);
> +     perf_event_context_sched_out(task, next);
> 
>       /*
>        * if cgroup events exist on this CPU, then we need
> @@ -3224,27 +3269,19 @@ void __perf_event_task_sched_out(struct
>               perf_cgroup_sched_out(task, next);
> }
> 
> -/*
> - * Called with IRQs disabled
> - */
> -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> -                           enum event_type_t event_type)
> -{
> -     ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
> -}
> -
> -static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
> -                           int (*func)(struct perf_event *, void *), void *data)
> +static int
> +visit_groups_merge(struct perf_event_groups *groups, int cpu, struct pmu *pmu,
> +                int (*func)(struct perf_event *, void *), void *data)
> {
>       struct perf_event **evt, *evt1, *evt2;
>       int ret;
> 
> -     evt1 = perf_event_groups_first(groups, -1);
> -     evt2 = perf_event_groups_first(groups, cpu);
> +     evt1 = perf_event_groups_first(groups, -1, pmu);
> +     evt2 = perf_event_groups_first(groups, cpu, pmu);
> 
>       while (evt1 || evt2) {
>               if (evt1 && evt2) {
> -                     if (evt1->group_index < evt2->group_index)
> +                     if (perf_event_groups_less(evt1, evt2))
>                               evt = &evt1;
>                       else
>                               evt = &evt2;
> @@ -3258,7 +3295,7 @@ static int visit_groups_merge(struct per
>               if (ret)
>                       return ret;
> 
> -             *evt = perf_event_groups_next(*evt);
> +             *evt = perf_event_groups_next(*evt, pmu);
>       }
> 
>       return 0;
> @@ -3266,91 +3303,106 @@ static int visit_groups_merge(struct per
> 
> struct sched_in_data {
>       struct perf_event_context *ctx;
> -     struct perf_cpu_context *cpuctx;
> +     struct perf_event_pmu_context *epc;
>       int can_add_hw;
> +
> +     int pinned; /* set for pinned semantics */
> +     int busy;   /* set to terminate on busy */
> };
> 
> -static int pinned_sched_in(struct perf_event *event, void *data)
> +static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
> {
> -     struct sched_in_data *sid = data;
> +     struct perf_cpu_pmu_context *cpc;
> 
> -     if (event->state <= PERF_EVENT_STATE_OFF)
> -             return 0;
> -
> -     if (!event_filter_match(event))
> -             return 0;
> -
> -     if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> -             if (!group_sched_in(event, sid->cpuctx, sid->ctx))
> -                     list_add_tail(&event->active_list, &sid->ctx->pinned_active);
> -     }
> -
> -     /*
> -      * If this pinned group hasn't been scheduled,
> -      * put it in error state.
> -      */
> -     if (event->state == PERF_EVENT_STATE_INACTIVE)
> -             perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
> +     if (!pmu_ctx->ctx->task)
> +             return;
> 
> -     return 0;
> +     cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
> +     WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
> +     cpc->task_epc = pmu_ctx;
> }
> 
> -static int flexible_sched_in(struct perf_event *event, void *data)
> +static int merge_sched_in(struct perf_event *event, void *data)
> {
>       struct sched_in_data *sid = data;
> 
> +     if (sid->epc != event->pmu_ctx) {
> +             sid->epc = event->pmu_ctx;
> +             sid->can_add_hw = 1;
> +             __link_epc(event->pmu_ctx);
> +
> +             perf_assert_pmu_disabled(sid->epc->pmu);
> +     }
> +
>       if (event->state <= PERF_EVENT_STATE_OFF)
>               return 0;
> 
>       if (!event_filter_match(event))
>               return 0;
> 
> -     if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> -             if (!group_sched_in(event, sid->cpuctx, sid->ctx))
> -                     list_add_tail(&event->active_list, &sid->ctx->flexible_active);
> -             else
> +     if (group_can_go_on(event, sid->can_add_hw)) {
> +             if (!group_sched_in(event, sid->ctx)) {
> +                     struct list_head *list;
> +
> +                     if (sid->pinned)
> +                             list = &sid->epc->pinned_active;
> +                     else
> +                             list = &sid->epc->flexible_active;
> +
> +                     list_add_tail(&event->active_list, list);
> +             }
> +     }
> +
> +     if (event->state == PERF_EVENT_STATE_INACTIVE) {
> +             if (sid->pinned) {
> +                     /*
> +                      * If this pinned group hasn't been scheduled,
> +                      * put it in error state.
> +                      */
> +                     perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
> +             } else {
>                       sid->can_add_hw = 0;
> +                     return sid->busy;
> +             }
>       }
> 
>       return 0;
> }
> 
> static void
> -ctx_pinned_sched_in(struct perf_event_context *ctx,
> -                 struct perf_cpu_context *cpuctx)
> +ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> {
>       struct sched_in_data sid = {
>               .ctx = ctx,
> -             .cpuctx = cpuctx,
> -             .can_add_hw = 1,
> +             .pinned = 1,
>       };
> 
> -     visit_groups_merge(&ctx->pinned_groups,
> -                        smp_processor_id(),
> -                        pinned_sched_in, &sid);
> +     visit_groups_merge(&ctx->pinned_groups, smp_processor_id(), pmu,
> +                        merge_sched_in, &sid);
> }
> 
> static void
> -ctx_flexible_sched_in(struct perf_event_context *ctx,
> -                   struct perf_cpu_context *cpuctx)
> +ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> {
>       struct sched_in_data sid = {
>               .ctx = ctx,
> -             .cpuctx = cpuctx,
> -             .can_add_hw = 1,
> +             .busy = pmu ? -EBUSY : 0,
>       };
> 
> -     visit_groups_merge(&ctx->flexible_groups,
> -                        smp_processor_id(),
> -                        flexible_sched_in, &sid);
> +     visit_groups_merge(&ctx->flexible_groups, smp_processor_id(), pmu,
> +                        merge_sched_in, &sid);
> +}
> +
> +static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> +{
> +     ctx_flexible_sched_in(ctx, pmu);
> }
> 
> static void
> -ctx_sched_in(struct perf_event_context *ctx,
> -          struct perf_cpu_context *cpuctx,
> -          enum event_type_t event_type,
> +ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type,
>            struct task_struct *task)
> {
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
>       int is_active = ctx->is_active;
>       u64 now;
> 
> @@ -3373,6 +3425,7 @@ ctx_sched_in(struct perf_event_context *
>               /* start ctx time */
>               now = perf_clock();
>               ctx->timestamp = now;
> +             // XXX ctx->task =? task
>               perf_cgroup_set_timestamp(task, ctx);
>       }
> 
> @@ -3381,30 +3434,25 @@ ctx_sched_in(struct perf_event_context *
>        * in order to give them the best chance of going on.
>        */
>       if (is_active & EVENT_PINNED)
> -             ctx_pinned_sched_in(ctx, cpuctx);
> +             ctx_pinned_sched_in(ctx, NULL);
> 
>       /* Then walk through the lower prio flexible groups */
>       if (is_active & EVENT_FLEXIBLE)
> -             ctx_flexible_sched_in(ctx, cpuctx);
> +             ctx_flexible_sched_in(ctx, NULL);
> }
> 
> -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> -                          enum event_type_t event_type,
> -                          struct task_struct *task)
> +static void perf_event_context_sched_in(struct task_struct *task)
> {
> -     struct perf_event_context *ctx = &cpuctx->ctx;
> -
> -     ctx_sched_in(ctx, cpuctx, event_type, task);
> -}
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
> +     struct perf_event_context *ctx;
> 
> -static void perf_event_context_sched_in(struct perf_event_context *ctx,
> -                                     struct task_struct *task)
> -{
> -     struct perf_cpu_context *cpuctx;
> +     rcu_read_lock();
> +     ctx = rcu_dereference(task->perf_event_ctxp);
> +     if (!ctx)
> +             goto rcu_unlock;
> 
> -     cpuctx = __get_cpu_context(ctx);
>       if (cpuctx->task_ctx == ctx)
> -             return;
> +             goto rcu_unlock;
> 
>       perf_ctx_lock(cpuctx, ctx);
>       /*
> @@ -3414,7 +3462,7 @@ static void perf_event_context_sched_in(
>       if (!ctx->nr_events)
>               goto unlock;
> 
> -     perf_pmu_disable(ctx->pmu);
> +     perf_ctx_disable(ctx);
>       /*
>        * We want to keep the following priority order:
>        * cpu pinned (that don't need to move), task pinned,
> @@ -3423,13 +3471,21 @@ static void perf_event_context_sched_in(
>        * However, if task's ctx is not carrying any pinned
>        * events, no need to flip the cpuctx's events around.
>        */
> -     if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
> -             cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> +     if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
> +             perf_ctx_disable(&cpuctx->ctx);
> +             ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> +     }
> +
>       perf_event_sched_in(cpuctx, ctx, task);
> -     perf_pmu_enable(ctx->pmu);
> +
> +     if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
> +             perf_ctx_enable(&cpuctx->ctx);
> +     perf_ctx_enable(ctx);
> 
> unlock:
>       perf_ctx_unlock(cpuctx, ctx);
> +rcu_unlock:
> +     rcu_read_unlock();
> }
> 
> /*
> @@ -3446,9 +3502,6 @@ static void perf_event_context_sched_in(
> void __perf_event_task_sched_in(struct task_struct *prev,
>                               struct task_struct *task)
> {
> -     struct perf_event_context *ctx;
> -     int ctxn;
> -
>       /*
>        * If cgroup events exist on this CPU, then we need to check if we have
>        * to switch in PMU state; cgroup event are system-wide mode only.
> @@ -3459,13 +3512,7 @@ void __perf_event_task_sched_in(struct t
>       if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
>               perf_cgroup_sched_in(prev, task);
> 
> -     for_each_task_context_nr(ctxn) {
> -             ctx = task->perf_event_ctxp[ctxn];
> -             if (likely(!ctx))
> -                     continue;
> -
> -             perf_event_context_sched_in(ctx, task);
> -     }
> +     perf_event_context_sched_in(task);
> 
>       if (atomic_read(&nr_switch_events))
>               perf_event_switch(task, prev, true);
> @@ -3584,8 +3631,8 @@ static void perf_adjust_period(struct pe
>  * events. At the same time, make sure, having freq events does not change
>  * the rate of unthrottling as that would introduce bias.
>  */
> -static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
> -                                        int needs_unthr)
> +static void
> +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
> {
>       struct perf_event *event;
>       struct hw_perf_event *hwc;
> @@ -3597,16 +3644,16 @@ static void perf_adjust_freq_unthr_conte
>        * - context have events in frequency mode (needs freq adjust)
>        * - there are events to unthrottle on this cpu
>        */
> -     if (!(ctx->nr_freq || needs_unthr))
> +     if (!(ctx->nr_freq || unthrottle))
>               return;
> 
>       raw_spin_lock(&ctx->lock);
> -     perf_pmu_disable(ctx->pmu);
> 
>       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
>               if (event->state != PERF_EVENT_STATE_ACTIVE)
>                       continue;
> 
> +             // XXX use visit thingy to avoid the -1,cpu match
>               if (!event_filter_match(event))
>                       continue;
> 
> @@ -3647,7 +3694,6 @@ static void perf_adjust_freq_unthr_conte
>               perf_pmu_enable(event->pmu);
>       }
> 
> -     perf_pmu_enable(ctx->pmu);
>       raw_spin_unlock(&ctx->lock);
> }
> 
> @@ -3668,71 +3714,97 @@ static void rotate_ctx(struct perf_event
> }
> 
> static inline struct perf_event *
> -ctx_first_active(struct perf_event_context *ctx)
> +ctx_first_active(struct perf_event_pmu_context *pmu_ctx)
> {
> -     return list_first_entry_or_null(&ctx->flexible_active,
> +     return list_first_entry_or_null(&pmu_ctx->flexible_active,
>                                       struct perf_event, active_list);
> }
> 
> -static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
> +/*
> + * XXX somewhat completely buggered; this is in cpu_pmu_context, but we need
> + * event_pmu_context for rotations. We also need event_pmu_context specific
> + * scheduling routines. ARGH
> + *
> + *  - fixed the cpu_pmu_context vs event_pmu_context thingy
> + *    (cpu_pmu_context embeds an event_pmu_context)
> + *
> + *  - need nr_events/nr_active in epc to do per epc rotation
> + *    (done)
> + *
> + *  - need cpu and task pmu ctx together...
> + *    (cpc->task_epc)
> + */
> +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
> {
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
> +     struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
>       struct perf_event *cpu_event = NULL, *task_event = NULL;
>       bool cpu_rotate = false, task_rotate = false;
>       struct perf_event_context *ctx = NULL;
> +     struct pmu *pmu;
> 
>       /*
>        * Since we run this from IRQ context, nobody can install new
>        * events, thus the event count values are stable.
>        */
> 
> -     if (cpuctx->ctx.nr_events) {
> -             if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
> -                     cpu_rotate = true;
> -     }
> +     cpu_epc = &cpc->epc;
> +     pmu = cpu_epc->pmu;
> 
> -     ctx = cpuctx->task_ctx;
> -     if (ctx && ctx->nr_events) {
> -             if (ctx->nr_events != ctx->nr_active)
> +     if (cpu_epc->nr_events && cpu_epc->nr_events != cpu_epc->nr_active)
> +             cpu_rotate = true;
> +
> +     task_epc = cpc->task_epc;
> +     if (task_epc) {
> +             WARN_ON_ONCE(task_epc->pmu != pmu);
> +             if (task_epc->nr_events && task_epc->nr_events != task_epc->nr_active)
>                       task_rotate = true;
>       }
> 
>       if (!(cpu_rotate || task_rotate))
>               return false;
> 
> -     perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> -     perf_pmu_disable(cpuctx->ctx.pmu);
> +     perf_ctx_lock(cpuctx, ctx);
> +     perf_pmu_disable(pmu);
> 
>       if (task_rotate)
> -             task_event = ctx_first_active(ctx);
> +             task_event = ctx_first_active(task_epc);
> +
>       if (cpu_rotate)
> -             cpu_event = ctx_first_active(&cpuctx->ctx);
> +             cpu_event = ctx_first_active(cpu_epc);
> 
>       /*
>        * As per the order given at ctx_resched() first 'pop' task flexible
>        * and then, if needed CPU flexible.
>        */
> -     if (task_event || (ctx && cpu_event))
> -             ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
> -     if (cpu_event)
> -             cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> +     if (task_event || (task_epc && cpu_event)) {
> +             update_context_time(ctx);
> +             __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
> +     }
> +
> +     if (cpu_event) {
> +             update_context_time(&cpuctx->ctx);
> +             __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
> +             rotate_ctx(&cpuctx->ctx, cpu_event);
> +             __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
> +     }
> 
>       if (task_event)
>               rotate_ctx(ctx, task_event);
> -     if (cpu_event)
> -             rotate_ctx(&cpuctx->ctx, cpu_event);
> 
> -     perf_event_sched_in(cpuctx, ctx, current);
> +     if (task_event || (task_epc && cpu_event))
> +             __pmu_ctx_sched_in(ctx, pmu);
> 
> -     perf_pmu_enable(cpuctx->ctx.pmu);
> -     perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +     perf_pmu_enable(pmu);
> +     perf_ctx_unlock(cpuctx, ctx);
> 
>       return true;
> }
> 
> void perf_event_task_tick(void)
> {
> -     struct list_head *head = this_cpu_ptr(&active_ctx_list);
> -     struct perf_event_context *ctx, *tmp;
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
> +     struct perf_event_context *ctx;
>       int throttled;
> 
>       lockdep_assert_irqs_disabled();
> @@ -3741,8 +3813,13 @@ void perf_event_task_tick(void)
>       throttled = __this_cpu_xchg(perf_throttled_count, 0);
>       tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
> 
> -     list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
> -             perf_adjust_freq_unthr_context(ctx, throttled);
> +     perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
> +
> +     rcu_read_lock();
> +     ctx = rcu_dereference(current->perf_event_ctxp);
> +     if (ctx)
> +             perf_adjust_freq_unthr_context(ctx, !!throttled);
> +     rcu_read_unlock();
> }
> 
> static int event_enable_on_exec(struct perf_event *event,
> @@ -3764,9 +3841,9 @@ static int event_enable_on_exec(struct p
>  * Enable all of a task's events that have been marked enable-on-exec.
>  * This expects task == current.
>  */
> -static void perf_event_enable_on_exec(int ctxn)
> +static void perf_event_enable_on_exec(struct perf_event_context *ctx)
> {
> -     struct perf_event_context *ctx, *clone_ctx = NULL;
> +     struct perf_event_context *clone_ctx = NULL;
>       enum event_type_t event_type = 0;
>       struct perf_cpu_context *cpuctx;
>       struct perf_event *event;
> @@ -3774,13 +3851,16 @@ static void perf_event_enable_on_exec(in
>       int enabled = 0;
> 
>       local_irq_save(flags);
> -     ctx = current->perf_event_ctxp[ctxn];
> -     if (!ctx || !ctx->nr_events)
> +     if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
>               goto out;
> 
> -     cpuctx = __get_cpu_context(ctx);
> +     if (!ctx->nr_events)
> +             goto out;
> +
> +     cpuctx = this_cpu_ptr(&cpu_context);
>       perf_ctx_lock(cpuctx, ctx);
> -     ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> +     ctx_sched_out(ctx, EVENT_TIME);
> +
>       list_for_each_entry(event, &ctx->event_list, event_entry) {
>               enabled |= event_enable_on_exec(event, ctx);
>               event_type |= get_event_type(event);
> @@ -3793,7 +3873,7 @@ static void perf_event_enable_on_exec(in
>               clone_ctx = unclone_ctx(ctx);
>               ctx_resched(cpuctx, ctx, event_type);
>       } else {
> -             ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
> +             ctx_sched_in(ctx, EVENT_TIME, current);
>       }
>       perf_ctx_unlock(cpuctx, ctx);
> 
> @@ -3835,7 +3915,7 @@ static void __perf_event_read(void *info
>       struct perf_read_data *data = info;
>       struct perf_event *sub, *event = data->event;
>       struct perf_event_context *ctx = event->ctx;
> -     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
>       struct pmu *pmu = event->pmu;
> 
>       /*
> @@ -4050,17 +4130,25 @@ static void __perf_event_init_context(st
> {
>       raw_spin_lock_init(&ctx->lock);
>       mutex_init(&ctx->mutex);
> -     INIT_LIST_HEAD(&ctx->active_ctx_list);
> +     INIT_LIST_HEAD(&ctx->pmu_ctx_list);
>       perf_event_groups_init(&ctx->pinned_groups);
>       perf_event_groups_init(&ctx->flexible_groups);
>       INIT_LIST_HEAD(&ctx->event_list);
> -     INIT_LIST_HEAD(&ctx->pinned_active);
> -     INIT_LIST_HEAD(&ctx->flexible_active);
>       atomic_set(&ctx->refcount, 1);
> }
> 
> +static void
> +__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
> +{
> +     epc->pmu = pmu;
> +     INIT_LIST_HEAD(&epc->pmu_ctx_entry);
> +     INIT_LIST_HEAD(&epc->pinned_active);
> +     INIT_LIST_HEAD(&epc->flexible_active);
> +     atomic_set(&epc->refcount, 1);
> +}
> +
> static struct perf_event_context *
> -alloc_perf_context(struct pmu *pmu, struct task_struct *task)
> +alloc_perf_context(struct task_struct *task)
> {
>       struct perf_event_context *ctx;
> 
> @@ -4073,7 +4161,6 @@ alloc_perf_context(struct pmu *pmu, stru
>               ctx->task = task;
>               get_task_struct(task);
>       }
> -     ctx->pmu = pmu;
> 
>       return ctx;
> }
> @@ -4102,22 +4189,19 @@ find_lively_task_by_vpid(pid_t vpid)
>  * Returns a matching context with refcount and pincount.
>  */
> static struct perf_event_context *
> -find_get_context(struct pmu *pmu, struct task_struct *task,
> -             struct perf_event *event)
> +find_get_context(struct task_struct *task, struct perf_event *event)
> {
>       struct perf_event_context *ctx, *clone_ctx = NULL;
>       struct perf_cpu_context *cpuctx;
> -     void *task_ctx_data = NULL;
>       unsigned long flags;
> -     int ctxn, err;
> -     int cpu = event->cpu;
> +     int err;
> 
>       if (!task) {
>               /* Must be root to operate on a CPU event: */
>               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
>                       return ERR_PTR(-EACCES);
> 
> -             cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> +             cpuctx = per_cpu_ptr(&cpu_context, event->cpu);
>               ctx = &cpuctx->ctx;
>               get_ctx(ctx);
>               ++ctx->pin_count;
> @@ -4126,43 +4210,22 @@ find_get_context(struct pmu *pmu, struct
>       }
> 
>       err = -EINVAL;
> -     ctxn = pmu->task_ctx_nr;
> -     if (ctxn < 0)
> -             goto errout;
> -
> -     if (event->attach_state & PERF_ATTACH_TASK_DATA) {
> -             task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
> -             if (!task_ctx_data) {
> -                     err = -ENOMEM;
> -                     goto errout;
> -             }
> -     }
> -
> retry:
> -     ctx = perf_lock_task_context(task, ctxn, &flags);
> +     ctx = perf_lock_task_context(task, &flags);
>       if (ctx) {
>               clone_ctx = unclone_ctx(ctx);
>               ++ctx->pin_count;
> 
> -             if (task_ctx_data && !ctx->task_ctx_data) {
> -                     ctx->task_ctx_data = task_ctx_data;
> -                     task_ctx_data = NULL;
> -             }
>               raw_spin_unlock_irqrestore(&ctx->lock, flags);
> 
>               if (clone_ctx)
>                       put_ctx(clone_ctx);
>       } else {
> -             ctx = alloc_perf_context(pmu, task);
> +             ctx = alloc_perf_context(task);
>               err = -ENOMEM;
>               if (!ctx)
>                       goto errout;
> 
> -             if (task_ctx_data) {
> -                     ctx->task_ctx_data = task_ctx_data;
> -                     task_ctx_data = NULL;
> -             }
> -
>               err = 0;
>               mutex_lock(&task->perf_event_mutex);
>               /*
> @@ -4171,12 +4234,12 @@ find_get_context(struct pmu *pmu, struct
>                */
>               if (task->flags & PF_EXITING)
>                       err = -ESRCH;
> -             else if (task->perf_event_ctxp[ctxn])
> +             else if (task->perf_event_ctxp)
>                       err = -EAGAIN;
>               else {
>                       get_ctx(ctx);
>                       ++ctx->pin_count;
> -                     rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
> +                     rcu_assign_pointer(task->perf_event_ctxp, ctx);
>               }
>               mutex_unlock(&task->perf_event_mutex);
> 
> @@ -4189,14 +4252,117 @@ find_get_context(struct pmu *pmu, struct
>               }
>       }
> 
> -     kfree(task_ctx_data);
>       return ctx;
> 
> errout:
> -     kfree(task_ctx_data);
>       return ERR_PTR(err);
> }
> 
> +struct perf_event_pmu_context *
> +find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
> +                  struct perf_event *event)
> +{
> +     struct perf_event_pmu_context *new = NULL, *epc;
> +     void *task_ctx_data = NULL;
> +
> +     if (!ctx->task) {
> +             struct perf_cpu_pmu_context *cpc;
> +
> +             cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
> +             epc = &cpc->epc;
> +
> +             if (!epc->ctx) {
> +                     atomic_set(&epc->refcount, 1);
> +                     epc->embedded = 1;
> +                     raw_spin_lock_irq(&ctx->lock);
> +                     list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
> +                     epc->ctx = ctx;
> +                     raw_spin_unlock_irq(&ctx->lock);
> +             } else {
> +                     WARN_ON_ONCE(epc->ctx != ctx);
> +                     atomic_inc(&epc->refcount);
> +             }
> +
> +             return epc;
> +     }
> +
> +     new = kzalloc(sizeof(*epc), GFP_KERNEL);
> +     if (!new)
> +             return ERR_PTR(-ENOMEM);
> +
> +     if (event->attach_state & PERF_ATTACH_TASK_DATA) {
> +             task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
> +             if (!task_ctx_data) {
> +                     kfree(new);
> +                     return ERR_PTR(-ENOMEM);
> +             }
> +     }
> +
> +     __perf_init_event_pmu_context(new, pmu);
> +
> +     raw_spin_lock_irq(&ctx->lock);
> +     list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +             if (epc->pmu == pmu) {
> +                     WARN_ON_ONCE(epc->ctx != ctx);
> +                     atomic_inc(&epc->refcount);
> +                     goto found_epc;
> +             }
> +     }
> +
> +     epc = new;
> +     new = NULL;
> +
> +     list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
> +     epc->ctx = ctx;
> +
> +found_epc:
> +     if (task_ctx_data && !epc->task_ctx_data) {
> +             epc->task_ctx_data = task_ctx_data;
> +             task_ctx_data = NULL;
> +             ctx->nr_task_data++;
> +     }
> +     raw_spin_unlock_irq(&ctx->lock);
> +
> +     kfree(task_ctx_data);
> +     kfree(new);
> +
> +     return epc;
> +}
> +
> +static void get_pmu_ctx(struct perf_event_pmu_context *epc)
> +{
> +     WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
> +}
> +
> +static void put_pmu_ctx(struct perf_event_pmu_context *epc)
> +{
> +     unsigned long flags;
> +
> +     if (!atomic_dec_and_test(&epc->refcount))
> +             return;
> +
> +     if (epc->ctx) {
> +             struct perf_event_context *ctx = epc->ctx;
> +
> +             // XXX ctx->mutex
> +
> +             WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
> +             raw_spin_lock_irqsave(&ctx->lock, flags);
> +             list_del_init(&epc->pmu_ctx_entry);
> +             epc->ctx = NULL;
> +             raw_spin_unlock_irqrestore(&ctx->lock, flags);
> +     }
> +
> +     WARN_ON_ONCE(!list_empty(&epc->pinned_active));
> +     WARN_ON_ONCE(!list_empty(&epc->flexible_active));
> +
> +     if (epc->embedded)
> +             return;
> +
> +     kfree(epc->task_ctx_data);
> +     kfree(epc);
> +}
> +
> static void perf_event_free_filter(struct perf_event *event);
> static void perf_event_free_bpf_prog(struct perf_event *event);
> 
> @@ -4445,6 +4611,9 @@ static void _free_event(struct perf_even
>       if (event->destroy)
>               event->destroy(event);
> 
> +     if (event->pmu_ctx)
> +             put_pmu_ctx(event->pmu_ctx);
> +
>       if (event->ctx)
>               put_ctx(event->ctx);
> 
> @@ -4943,7 +5112,7 @@ static void __perf_event_period(struct p
> 
>       active = (event->state == PERF_EVENT_STATE_ACTIVE);
>       if (active) {
> -             perf_pmu_disable(ctx->pmu);
> +             perf_pmu_disable(event->pmu);
>               /*
>                * We could be throttled; unthrottle now to avoid the tick
>                * trying to unthrottle while we already re-started the event.
> @@ -4959,7 +5128,7 @@ static void __perf_event_period(struct p
> 
>       if (active) {
>               event->pmu->start(event, PERF_EF_RELOAD);
> -             perf_pmu_enable(ctx->pmu);
> +             perf_pmu_enable(event->pmu);
>       }
> }
> 
> @@ -6634,7 +6803,6 @@ perf_iterate_sb(perf_iterate_f output, v
>              struct perf_event_context *task_ctx)
> {
>       struct perf_event_context *ctx;
> -     int ctxn;
> 
>       rcu_read_lock();
>       preempt_disable();
> @@ -6651,11 +6819,9 @@ perf_iterate_sb(perf_iterate_f output, v
> 
>       perf_iterate_sb_cpu(output, data);
> 
> -     for_each_task_context_nr(ctxn) {
> -             ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
> -             if (ctx)
> -                     perf_iterate_ctx(ctx, output, data, false);
> -     }
> +     ctx = rcu_dereference(current->perf_event_ctxp);
> +     if (ctx)
> +             perf_iterate_ctx(ctx, output, data, false);
> done:
>       preempt_enable();
>       rcu_read_unlock();
> @@ -6696,18 +6862,12 @@ static void perf_event_addr_filters_exec
> void perf_event_exec(void)
> {
>       struct perf_event_context *ctx;
> -     int ctxn;
> 
>       rcu_read_lock();
> -     for_each_task_context_nr(ctxn) {
> -             ctx = current->perf_event_ctxp[ctxn];
> -             if (!ctx)
> -                     continue;
> -
> -             perf_event_enable_on_exec(ctxn);
> -
> -             perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
> -                                true);
> +     ctx = rcu_dereference(current->perf_event_ctxp);
> +     if (ctx) {
> +             perf_event_enable_on_exec(ctx);
> +             perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
>       }
>       rcu_read_unlock();
> }
> @@ -6749,8 +6909,7 @@ static void __perf_event_output_stop(str
> static int __perf_pmu_output_stop(void *info)
> {
>       struct perf_event *event = info;
> -     struct pmu *pmu = event->pmu;
> -     struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
>       struct remote_output ro = {
>               .rb     = event->rb,
>       };
> @@ -7398,7 +7557,6 @@ static void __perf_addr_filters_adjust(s
> static void perf_addr_filters_adjust(struct vm_area_struct *vma)
> {
>       struct perf_event_context *ctx;
> -     int ctxn;
> 
>       /*
>        * Data tracing isn't supported yet and as such there is no need
> @@ -7408,13 +7566,9 @@ static void perf_addr_filters_adjust(str
>               return;
> 
>       rcu_read_lock();
> -     for_each_task_context_nr(ctxn) {
> -             ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
> -             if (!ctx)
> -                     continue;
> -
> +     ctx = rcu_dereference(current->perf_event_ctxp);
> +     if (ctx)
>               perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
> -     }
>       rcu_read_unlock();
> }
> 
> @@ -8309,10 +8463,13 @@ void perf_tp_event(u16 event_type, u64 c
>               struct trace_entry *entry = record;
> 
>               rcu_read_lock();
> -             ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
> +             ctx = rcu_dereference(task->perf_event_ctxp);
>               if (!ctx)
>                       goto unlock;
> 
> +             // XXX iterate groups instead, we should be able to
> +             // find the subtree for the perf_tracepoint pmu and CPU.
> +
>               list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
>                       if (event->cpu != smp_processor_id())
>                               continue;
> @@ -9404,25 +9561,6 @@ static int perf_event_idx_default(struct
>       return 0;
> }
> 
> -/*
> - * Ensures all contexts with the same task_ctx_nr have the same
> - * pmu_cpu_context too.
> - */
> -static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
> -{
> -     struct pmu *pmu;
> -
> -     if (ctxn < 0)
> -             return NULL;
> -
> -     list_for_each_entry(pmu, &pmus, entry) {
> -             if (pmu->task_ctx_nr == ctxn)
> -                     return pmu->pmu_cpu_context;
> -     }
> -
> -     return NULL;
> -}
> -
> static void free_pmu_context(struct pmu *pmu)
> {
>       /*
> @@ -9433,7 +9571,7 @@ static void free_pmu_context(struct pmu
>       if (pmu->task_ctx_nr > perf_invalid_context)
>               return;
> 
> -     free_percpu(pmu->pmu_cpu_context);
> +     free_percpu(pmu->cpu_pmu_context);
> }
> 
> /*
> @@ -9497,12 +9635,12 @@ perf_event_mux_interval_ms_store(struct
>       /* update all cpuctx for this PMU */
>       cpus_read_lock();
>       for_each_online_cpu(cpu) {
> -             struct perf_cpu_context *cpuctx;
> -             cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> -             cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
> +             struct perf_cpu_pmu_context *cpc;
> +             cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
> +             cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
> 
>               cpu_function_call(cpu,
> -                     (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
> +                     (remote_function_f)perf_mux_hrtimer_restart, cpc);
>       }
>       cpus_read_unlock();
>       mutex_unlock(&mux_interval_mutex);
> @@ -9602,44 +9740,19 @@ int perf_pmu_register(struct pmu *pmu, c
>       }
> 
> skip_type:
> -     if (pmu->task_ctx_nr == perf_hw_context) {
> -             static int hw_context_taken = 0;
> -
> -             /*
> -              * Other than systems with heterogeneous CPUs, it never makes
> -              * sense for two PMUs to share perf_hw_context. PMUs which are
> -              * uncore must use perf_invalid_context.
> -              */
> -             if (WARN_ON_ONCE(hw_context_taken &&
> -                 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
> -                     pmu->task_ctx_nr = perf_invalid_context;
> -
> -             hw_context_taken = 1;
> -     }
> -
> -     pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
> -     if (pmu->pmu_cpu_context)
> -             goto got_cpu_context;
> -
>       ret = -ENOMEM;
> -     pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
> -     if (!pmu->pmu_cpu_context)
> +     pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
> +     if (!pmu->cpu_pmu_context)
>               goto free_dev;
> 
>       for_each_possible_cpu(cpu) {
> -             struct perf_cpu_context *cpuctx;
> +             struct perf_cpu_pmu_context *cpc;
> 
> -             cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> -             __perf_event_init_context(&cpuctx->ctx);
> -             lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
> -             lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
> -             cpuctx->ctx.pmu = pmu;
> -             cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
> -
> -             __perf_mux_hrtimer_init(cpuctx, cpu);
> +             cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
> +             __perf_init_event_pmu_context(&cpc->epc, pmu);
> +             __perf_mux_hrtimer_init(cpc, cpu);
>       }
> 
> -got_cpu_context:
>       if (!pmu->start_txn) {
>               if (pmu->pmu_enable) {
>                       /*
> @@ -10349,37 +10462,6 @@ static int perf_event_set_clock(struct p
>       return 0;
> }
> 
> -/*
> - * Variation on perf_event_ctx_lock_nested(), except we take two context
> - * mutexes.
> - */
> -static struct perf_event_context *
> -__perf_event_ctx_lock_double(struct perf_event *group_leader,
> -                          struct perf_event_context *ctx)
> -{
> -     struct perf_event_context *gctx;
> -
> -again:
> -     rcu_read_lock();
> -     gctx = READ_ONCE(group_leader->ctx);
> -     if (!atomic_inc_not_zero(&gctx->refcount)) {
> -             rcu_read_unlock();
> -             goto again;
> -     }
> -     rcu_read_unlock();
> -
> -     mutex_lock_double(&gctx->mutex, &ctx->mutex);
> -
> -     if (group_leader->ctx != gctx) {
> -             mutex_unlock(&ctx->mutex);
> -             mutex_unlock(&gctx->mutex);
> -             put_ctx(gctx);
> -             goto again;
> -     }
> -
> -     return gctx;
> -}
> -
> /**
>  * sys_perf_event_open - open a performance event, associate it to a task/cpu
>  *
> @@ -10393,9 +10475,10 @@ SYSCALL_DEFINE5(perf_event_open,
>               pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
> {
>       struct perf_event *group_leader = NULL, *output_event = NULL;
> +     struct perf_event_pmu_context *pmu_ctx;
>       struct perf_event *event, *sibling;
>       struct perf_event_attr attr;
> -     struct perf_event_context *ctx, *uninitialized_var(gctx);
> +     struct perf_event_context *ctx;
>       struct file *event_file = NULL;
>       struct fd group = {NULL, 0};
>       struct task_struct *task = NULL;
> @@ -10506,6 +10589,8 @@ SYSCALL_DEFINE5(perf_event_open,
>               goto err_cred;
>       }
> 
> +     // XXX premature; what if this is allowed, but we get moved to a PMU
> +     // that doesn't have this.
>       if (is_sampling_event(event)) {
>               if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
>                       err = -EOPNOTSUPP;
> @@ -10525,50 +10610,45 @@ SYSCALL_DEFINE5(perf_event_open,
>                       goto err_alloc;
>       }
> 
> +     if (pmu->task_ctx_nr < 0 && task) {
> +             err = -EINVAL;
> +             goto err_alloc;
> +     }
> +
>       if (pmu->task_ctx_nr == perf_sw_context)
>               event->event_caps |= PERF_EV_CAP_SOFTWARE;
> 
> -     if (group_leader) {
> -             if (is_software_event(event) &&
> -                 !in_software_context(group_leader)) {
> -                     /*
> -                      * If the event is a sw event, but the group_leader
> -                      * is on hw context.
> -                      *
> -                      * Allow the addition of software events to hw
> -                      * groups, this is safe because software events
> -                      * never fail to schedule.
> -                      */
> -                     pmu = group_leader->ctx->pmu;
> -             } else if (!is_software_event(event) &&
> -                        is_software_event(group_leader) &&
> -                        (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
> -                     /*
> -                      * In case the group is a pure software group, and we
> -                      * try to add a hardware event, move the whole group to
> -                      * the hardware context.
> -                      */
> -                     move_group = 1;
> -             }
> -     }
> -
>       /*
>        * Get the target context (task or percpu):
>        */
> -     ctx = find_get_context(pmu, task, event);
> +     ctx = find_get_context(task, event);
>       if (IS_ERR(ctx)) {
>               err = PTR_ERR(ctx);
>               goto err_alloc;
>       }
> 
> -     if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
> -             err = -EBUSY;
> -             goto err_context;
> +     mutex_lock(&ctx->mutex);
> +
> +     if (ctx->task == TASK_TOMBSTONE) {
> +             err = -ESRCH;
> +             goto err_locked;
> +     }
> +
> +     if (!task) {
> +             /*
> +              * Check if the @cpu we're creating an event for is online.
> +              *
> +              * We use the perf_cpu_context::ctx::mutex to serialize against
> +              * the hotplug notifiers. See perf_event_{init,exit}_cpu().
> +              */
> +             struct perf_cpu_context *cpuctx = per_cpu_ptr(&cpu_context, event->cpu);
> +
> +             if (!cpuctx->online) {
> +                     err = -ENODEV;
> +                     goto err_locked;
> +             }
>       }
> 
> -     /*
> -      * Look up the group leader (we will attach this event to it):
> -      */
>       if (group_leader) {
>               err = -EINVAL;
> 
> @@ -10577,11 +10657,11 @@ SYSCALL_DEFINE5(perf_event_open,
>                * becoming part of another group-sibling):
>                */
>               if (group_leader->group_leader != group_leader)
> -                     goto err_context;
> +                     goto err_locked;
> 
>               /* All events in a group should have the same clock */
>               if (group_leader->clock != event->clock)
> -                     goto err_context;
> +                     goto err_locked;
> 
>               /*
>                * Make sure we're both events for the same CPU;
> @@ -10589,28 +10669,57 @@ SYSCALL_DEFINE5(perf_event_open,
>                * you can never concurrently schedule them anyhow.
>                */
>               if (group_leader->cpu != event->cpu)
> -                     goto err_context;
> -
> -             /*
> -              * Make sure we're both on the same task, or both
> -              * per-CPU events.
> -              */
> -             if (group_leader->ctx->task != ctx->task)
> -                     goto err_context;
> +                     goto err_locked;
> 
>               /*
> -              * Do not allow to attach to a group in a different task
> -              * or CPU context. If we're moving SW events, we'll fix
> -              * this up later, so allow that.
> +              * Make sure we're both on the same context; either task or cpu.
>                */
> -             if (!move_group && group_leader->ctx != ctx)
> -                     goto err_context;
> +             if (group_leader->ctx != ctx)
> +                     goto err_locked;
> 
>               /*
>                * Only a group leader can be exclusive or pinned
>                */
>               if (attr.exclusive || attr.pinned)
> -                     goto err_context;
> +                     goto err_locked;
> +
> +             if (is_software_event(event) &&
> +                 !in_software_context(group_leader)) {
> +                     /*
> +                      * If the event is a sw event, but the group_leader
> +                      * is on hw context.
> +                      *
> +                      * Allow the addition of software events to hw
> +                      * groups, this is safe because software events
> +                      * never fail to schedule.
> +                      */
> +                     pmu = group_leader->pmu_ctx->pmu;
> +             } else if (!is_software_event(event) &&
> +                        is_software_event(group_leader) &&
> +                        (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
> +                     /*
> +                      * In case the group is a pure software group, and we
> +                      * try to add a hardware event, move the whole group to
> +                      * the hardware context.
> +                      */
> +                     move_group = 1;
> +             }
> +     }
> +
> +     /*
> +      * Now that we're certain of the pmu; find the pmu_ctx.
> +      */
> +     pmu_ctx = find_get_pmu_context(pmu, ctx, event);
> +     if (IS_ERR(pmu_ctx)) {
> +             err = PTR_ERR(pmu_ctx);
> +             goto err_locked;
> +     }
> +     event->pmu_ctx = pmu_ctx;
> +
> +     // XXX think about exclusive
> +     if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
> +             err = -EBUSY;
> +             goto err_context;
>       }
> 
>       if (output_event) {
> @@ -10619,71 +10728,18 @@ SYSCALL_DEFINE5(perf_event_open,
>                       goto err_context;
>       }
> 
> -     event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
> -                                     f_flags);
> +     event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
>       if (IS_ERR(event_file)) {
>               err = PTR_ERR(event_file);
>               event_file = NULL;
>               goto err_context;
>       }
> 
> -     if (move_group) {
> -             gctx = __perf_event_ctx_lock_double(group_leader, ctx);
> -
> -             if (gctx->task == TASK_TOMBSTONE) {
> -                     err = -ESRCH;
> -                     goto err_locked;
> -             }
> -
> -             /*
> -              * Check if we raced against another sys_perf_event_open() call
> -              * moving the software group underneath us.
> -              */
> -             if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
> -                     /*
> -                      * If someone moved the group out from under us, check
> -                      * if this new event wound up on the same ctx, if so
> -                      * its the regular !move_group case, otherwise fail.
> -                      */
> -                     if (gctx != ctx) {
> -                             err = -EINVAL;
> -                             goto err_locked;
> -                     } else {
> -                             perf_event_ctx_unlock(group_leader, gctx);
> -                             move_group = 0;
> -                     }
> -             }
> -     } else {
> -             mutex_lock(&ctx->mutex);
> -     }
> -
> -     if (ctx->task == TASK_TOMBSTONE) {
> -             err = -ESRCH;
> -             goto err_locked;
> -     }
> -
>       if (!perf_event_validate_size(event)) {
>               err = -E2BIG;
> -             goto err_locked;
> +             goto err_file;
>       }
> 
> -     if (!task) {
> -             /*
> -              * Check if the @cpu we're creating an event for is online.
> -              *
> -              * We use the perf_cpu_context::ctx::mutex to serialize against
> -              * the hotplug notifiers. See perf_event_{init,exit}_cpu().
> -              */
> -             struct perf_cpu_context *cpuctx =
> -                     container_of(ctx, struct perf_cpu_context, ctx);
> -
> -             if (!cpuctx->online) {
> -                     err = -ENODEV;
> -                     goto err_locked;
> -             }
> -     }
> -
> -
>       /*
>        * Must be under the same ctx::mutex as perf_install_in_context(),
>        * because we need to serialize with concurrent event creation.
> @@ -10693,7 +10749,7 @@ SYSCALL_DEFINE5(perf_event_open,
>               WARN_ON_ONCE(move_group);
> 
>               err = -EBUSY;
> -             goto err_locked;
> +             goto err_file;
>       }
> 
>       WARN_ON_ONCE(ctx->parent_ctx);
> @@ -10704,25 +10760,15 @@ SYSCALL_DEFINE5(perf_event_open,
>        */
> 
>       if (move_group) {
> -             /*
> -              * See perf_event_ctx_lock() for comments on the details
> -              * of swizzling perf_event::ctx.
> -              */
>               perf_remove_from_context(group_leader, 0);
> -             put_ctx(gctx);
> +             put_pmu_ctx(group_leader->pmu_ctx);
> 
>               for_each_sibling_event(sibling, group_leader) {
>                       perf_remove_from_context(sibling, 0);
> -                     put_ctx(gctx);
> +                     put_pmu_ctx(sibling->pmu_ctx);
>               }
> 
>               /*
> -              * Wait for everybody to stop referencing the events through
> -              * the old lists, before installing it on new lists.
> -              */
> -             synchronize_rcu();
> -
> -             /*
>                * Install the group siblings before the group leader.
>                *
>                * Because a group leader will try and install the entire group
> @@ -10733,9 +10779,10 @@ SYSCALL_DEFINE5(perf_event_open,
>                * reachable through the group lists.
>                */
>               for_each_sibling_event(sibling, group_leader) {
> +                     sibling->pmu_ctx = pmu_ctx;
> +                     get_pmu_ctx(pmu_ctx);
>                       perf_event__state_init(sibling);
>                       perf_install_in_context(ctx, sibling, sibling->cpu);
> -                     get_ctx(ctx);
>               }
> 
>               /*
> @@ -10743,9 +10790,10 @@ SYSCALL_DEFINE5(perf_event_open,
>                * event. What we want here is event in the initial
>                * startup state, ready to be add into new context.
>                */
> +             group_leader->pmu_ctx = pmu_ctx;
> +             get_pmu_ctx(pmu_ctx);
>               perf_event__state_init(group_leader);
>               perf_install_in_context(ctx, group_leader, group_leader->cpu);
> -             get_ctx(ctx);
>       }
> 
>       /*
> @@ -10762,8 +10810,6 @@ SYSCALL_DEFINE5(perf_event_open,
>       perf_install_in_context(ctx, event, event->cpu);
>       perf_unpin_context(ctx);
> 
> -     if (move_group)
> -             perf_event_ctx_unlock(group_leader, gctx);
>       mutex_unlock(&ctx->mutex);
> 
>       if (task) {
> @@ -10785,13 +10831,12 @@ SYSCALL_DEFINE5(perf_event_open,
>       fd_install(event_fd, event_file);
>       return event_fd;
> 
> -err_locked:
> -     if (move_group)
> -             perf_event_ctx_unlock(group_leader, gctx);
> -     mutex_unlock(&ctx->mutex);
> -/* err_file: */
> +err_file:
>       fput(event_file);
> err_context:
> +     /* event->pmu_ctx freed by free_event() */
> +err_locked:
> +     mutex_unlock(&ctx->mutex);
>       perf_unpin_context(ctx);
>       put_ctx(ctx);
> err_alloc:
> @@ -10827,8 +10872,10 @@ perf_event_create_kernel_counter(struct
>                                perf_overflow_handler_t overflow_handler,
>                                void *context)
> {
> +     struct perf_event_pmu_context *pmu_ctx;
>       struct perf_event_context *ctx;
>       struct perf_event *event;
> +     struct pmu *pmu;
>       int err;
> 
>       /*
> @@ -10844,12 +10891,28 @@ perf_event_create_kernel_counter(struct
> 
>       /* Mark owner so we could distinguish it from user events. */
>       event->owner = TASK_TOMBSTONE;
> +     pmu = event->pmu;
> +
> +     if (pmu->task_ctx_nr < 0 && task) {
> +             err = -EINVAL;
> +             goto err_alloc;
> +     }
> +
> +     if (pmu->task_ctx_nr == perf_sw_context)
> +             event->event_caps |= PERF_EV_CAP_SOFTWARE;
> 
> -     ctx = find_get_context(event->pmu, task, event);
> +     ctx = find_get_context(task, event);
>       if (IS_ERR(ctx)) {
>               err = PTR_ERR(ctx);
> -             goto err_free;
> +             goto err_alloc;
> +     }
> +
> +     pmu_ctx = find_get_pmu_context(pmu, ctx, event);
> +     if (IS_ERR(pmu_ctx)) {
> +             err = PTR_ERR(pmu_ctx);
> +             goto err_ctx;
>       }
> +     event->pmu_ctx = pmu_ctx;
> 
>       WARN_ON_ONCE(ctx->parent_ctx);
>       mutex_lock(&ctx->mutex);
> @@ -10886,9 +10949,10 @@ perf_event_create_kernel_counter(struct
> 
> err_unlock:
>       mutex_unlock(&ctx->mutex);
> +err_ctx:
>       perf_unpin_context(ctx);
>       put_ctx(ctx);
> -err_free:
> +err_alloc:
>       free_event(event);
> err:
>       return ERR_PTR(err);
> @@ -10897,6 +10961,7 @@ EXPORT_SYMBOL_GPL(perf_event_create_kern
> 
> void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
> {
> +#if 0 // XXX buggered - cpu hotplug, who cares
>       struct perf_event_context *src_ctx;
>       struct perf_event_context *dst_ctx;
>       struct perf_event *event, *tmp;
> @@ -10957,6 +11022,7 @@ void perf_pmu_migrate_context(struct pmu
>       }
>       mutex_unlock(&dst_ctx->mutex);
>       mutex_unlock(&src_ctx->mutex);
> +#endif
> }
> EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
> 
> @@ -11038,14 +11104,14 @@ perf_event_exit_event(struct perf_event
>       put_event(parent_event);
> }
> 
> -static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
> +static void perf_event_exit_task_context(struct task_struct *child)
> {
>       struct perf_event_context *child_ctx, *clone_ctx = NULL;
>       struct perf_event *child_event, *next;
> 
>       WARN_ON_ONCE(child != current);
> 
> -     child_ctx = perf_pin_task_context(child, ctxn);
> +     child_ctx = perf_pin_task_context(child);
>       if (!child_ctx)
>               return;
> 
> @@ -11067,13 +11133,13 @@ static void perf_event_exit_task_context
>        * in.
>        */
>       raw_spin_lock_irq(&child_ctx->lock);
> -     task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
> +     task_ctx_sched_out(child_ctx, EVENT_ALL);
> 
>       /*
>        * Now that the context is inactive, destroy the task <-> ctx relation
>        * and mark the context dead.
>        */
> -     RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
> +     RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
>       put_ctx(child_ctx); /* cannot be last */
>       WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
>       put_task_struct(current); /* cannot be last */
> @@ -11108,7 +11174,6 @@ static void perf_event_exit_task_context
> void perf_event_exit_task(struct task_struct *child)
> {
>       struct perf_event *event, *tmp;
> -     int ctxn;
> 
>       mutex_lock(&child->perf_event_mutex);
>       list_for_each_entry_safe(event, tmp, &child->perf_event_list,
> @@ -11124,8 +11189,7 @@ void perf_event_exit_task(struct task_st
>       }
>       mutex_unlock(&child->perf_event_mutex);
> 
> -     for_each_task_context_nr(ctxn)
> -             perf_event_exit_task_context(child, ctxn);
> +     perf_event_exit_task_context(child);
> 
>       /*
>        * The perf_event_exit_task_context calls perf_event_task
> @@ -11168,40 +11232,34 @@ void perf_event_free_task(struct task_st
> {
>       struct perf_event_context *ctx;
>       struct perf_event *event, *tmp;
> -     int ctxn;
> 
> -     for_each_task_context_nr(ctxn) {
> -             ctx = task->perf_event_ctxp[ctxn];
> -             if (!ctx)
> -                     continue;
> +     ctx = rcu_dereference(task->perf_event_ctxp);
> +     if (!ctx)
> +             return;
> 
> -             mutex_lock(&ctx->mutex);
> -             raw_spin_lock_irq(&ctx->lock);
> -             /*
> -              * Destroy the task <-> ctx relation and mark the context dead.
> -              *
> -              * This is important because even though the task hasn't been
> -              * exposed yet the context has been (through child_list).
> -              */
> -             RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
> -             WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
> -             put_task_struct(task); /* cannot be last */
> -             raw_spin_unlock_irq(&ctx->lock);
> +     mutex_lock(&ctx->mutex);
> +     raw_spin_lock_irq(&ctx->lock);
> +     /*
> +      * Destroy the task <-> ctx relation and mark the context dead.
> +      *
> +      * This is important because even though the task hasn't been
> +      * exposed yet the context has been (through child_list).
> +      */
> +     RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
> +     WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
> +     put_task_struct(task); /* cannot be last */
> +     raw_spin_unlock_irq(&ctx->lock);
> 
> -             list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
> -                     perf_free_event(event, ctx);
> +     list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
> +             perf_free_event(event, ctx);
> 
> -             mutex_unlock(&ctx->mutex);
> -             put_ctx(ctx);
> -     }
> +     mutex_unlock(&ctx->mutex);
> +     put_ctx(ctx);
> }
> 
> void perf_event_delayed_put(struct task_struct *task)
> {
> -     int ctxn;
> -
> -     for_each_task_context_nr(ctxn)
> -             WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
> +     WARN_ON_ONCE(task->perf_event_ctxp);
> }
> 
> struct file *perf_event_get(unsigned int fd)
> @@ -11253,6 +11311,7 @@ inherit_event(struct perf_event *parent_
>             struct perf_event_context *child_ctx)
> {
>       enum perf_event_state parent_state = parent_event->state;
> +     struct perf_event_pmu_context *pmu_ctx;
>       struct perf_event *child_event;
>       unsigned long flags;
> 
> @@ -11273,18 +11332,12 @@ inherit_event(struct perf_event *parent_
>       if (IS_ERR(child_event))
>               return child_event;
> 
> -
> -     if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
> -         !child_ctx->task_ctx_data) {
> -             struct pmu *pmu = child_event->pmu;
> -
> -             child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
> -                                                GFP_KERNEL);
> -             if (!child_ctx->task_ctx_data) {
> -                     free_event(child_event);
> -                     return NULL;
> -             }
> +     pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
> +     if (!pmu_ctx) {
> +             free_event(child_event);
> +             return NULL;
>       }
> +     child_event->pmu_ctx = pmu_ctx;
> 
>       /*
>        * is_orphaned_event() and list_add_tail(&parent_event->child_list)
> @@ -11402,18 +11455,18 @@ static int inherit_group(struct perf_eve
> static int
> inherit_task_group(struct perf_event *event, struct task_struct *parent,
>                  struct perf_event_context *parent_ctx,
> -                struct task_struct *child, int ctxn,
> +                struct task_struct *child,
>                  int *inherited_all)
> {
> -     int ret;
>       struct perf_event_context *child_ctx;
> +     int ret;
> 
>       if (!event->attr.inherit) {
>               *inherited_all = 0;
>               return 0;
>       }
> 
> -     child_ctx = child->perf_event_ctxp[ctxn];
> +     child_ctx = child->perf_event_ctxp;
>       if (!child_ctx) {
>               /*
>                * This is executed from the parent task context, so
> @@ -11421,16 +11474,14 @@ inherit_task_group(struct perf_event *ev
>                * First allocate and initialize a context for the
>                * child.
>                */
> -             child_ctx = alloc_perf_context(parent_ctx->pmu, child);
> +             child_ctx = alloc_perf_context(child);
>               if (!child_ctx)
>                       return -ENOMEM;
> 
> -             child->perf_event_ctxp[ctxn] = child_ctx;
> +             child->perf_event_ctxp = child_ctx;
>       }
> 
> -     ret = inherit_group(event, parent, parent_ctx,
> -                         child, child_ctx);
> -
> +     ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
>       if (ret)
>               *inherited_all = 0;
> 
> @@ -11440,7 +11491,7 @@ inherit_task_group(struct perf_event *ev
> /*
>  * Initialize the perf_event context in task_struct
>  */
> -static int perf_event_init_context(struct task_struct *child, int ctxn)
> +static int perf_event_init_context(struct task_struct *child)
> {
>       struct perf_event_context *child_ctx, *parent_ctx;
>       struct perf_event_context *cloned_ctx;
> @@ -11450,14 +11501,14 @@ static int perf_event_init_context(struc
>       unsigned long flags;
>       int ret = 0;
> 
> -     if (likely(!parent->perf_event_ctxp[ctxn]))
> +     if (likely(!parent->perf_event_ctxp))
>               return 0;
> 
>       /*
>        * If the parent's context is a clone, pin it so it won't get
>        * swapped under us.
>        */
> -     parent_ctx = perf_pin_task_context(parent, ctxn);
> +     parent_ctx = perf_pin_task_context(parent);
>       if (!parent_ctx)
>               return 0;
> 
> @@ -11480,7 +11531,7 @@ static int perf_event_init_context(struc
>        */
>       perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
>               ret = inherit_task_group(event, parent, parent_ctx,
> -                                      child, ctxn, &inherited_all);
> +                                      child, &inherited_all);
>               if (ret)
>                       goto out_unlock;
>       }
> @@ -11496,7 +11547,7 @@ static int perf_event_init_context(struc
> 
>       perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
>               ret = inherit_task_group(event, parent, parent_ctx,
> -                                      child, ctxn, &inherited_all);
> +                                      child, &inherited_all);
>               if (ret)
>                       goto out_unlock;
>       }
> @@ -11504,7 +11555,7 @@ static int perf_event_init_context(struc
>       raw_spin_lock_irqsave(&parent_ctx->lock, flags);
>       parent_ctx->rotate_disable = 0;
> 
> -     child_ctx = child->perf_event_ctxp[ctxn];
> +     child_ctx = child->perf_event_ctxp;
> 
>       if (child_ctx && inherited_all) {
>               /*
> @@ -11540,18 +11591,16 @@ static int perf_event_init_context(struc
>  */
> int perf_event_init_task(struct task_struct *child)
> {
> -     int ctxn, ret;
> +     int ret;
> 
> -     memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
> +     child->perf_event_ctxp = NULL;
>       mutex_init(&child->perf_event_mutex);
>       INIT_LIST_HEAD(&child->perf_event_list);
> 
> -     for_each_task_context_nr(ctxn) {
> -             ret = perf_event_init_context(child, ctxn);
> -             if (ret) {
> -                     perf_event_free_task(child);
> -                     return ret;
> -             }
> +     ret = perf_event_init_context(child);
> +     if (ret) {
> +             perf_event_free_task(child);
> +             return ret;
>       }
> 
>       return 0;
> @@ -11560,6 +11609,7 @@ int perf_event_init_task(struct task_str
> static void __init perf_event_init_all_cpus(void)
> {
>       struct swevent_htable *swhash;
> +     struct perf_cpu_context *cpuctx;
>       int cpu;
> 
>       zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
> @@ -11567,7 +11617,6 @@ static void __init perf_event_init_all_c
>       for_each_possible_cpu(cpu) {
>               swhash = &per_cpu(swevent_htable, cpu);
>               mutex_init(&swhash->hlist_mutex);
> -             INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
> 
>               INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
>               raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
> @@ -11576,6 +11625,12 @@ static void __init perf_event_init_all_c
>               INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
> #endif
>               INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
> +
> +             cpuctx = per_cpu_ptr(&cpu_context, cpu);
> +             __perf_event_init_context(&cpuctx->ctx);
> +             lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
> +             lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
> +             cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
>       }
> }
> 
> @@ -11597,12 +11652,12 @@ void perf_swevent_init_cpu(unsigned int
> #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
> static void __perf_event_exit_context(void *__info)
> {
> +     struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
>       struct perf_event_context *ctx = __info;
> -     struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
>       struct perf_event *event;
> 
>       raw_spin_lock(&ctx->lock);
> -     ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> +     ctx_sched_out(ctx, EVENT_TIME);
>       list_for_each_entry(event, &ctx->event_list, event_entry)
>               __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
>       raw_spin_unlock(&ctx->lock);
> @@ -11612,18 +11667,16 @@ static void perf_event_exit_cpu_context(
> {
>       struct perf_cpu_context *cpuctx;
>       struct perf_event_context *ctx;
> -     struct pmu *pmu;
> 
> +     // XXX simplify cpuctx->online
>       mutex_lock(&pmus_lock);
> -     list_for_each_entry(pmu, &pmus, entry) {
> -             cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> -             ctx = &cpuctx->ctx;
> +     cpuctx = per_cpu_ptr(&cpu_context, cpu);
> +     ctx = &cpuctx->ctx;
> 
> -             mutex_lock(&ctx->mutex);
> -             smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
> -             cpuctx->online = 0;
> -             mutex_unlock(&ctx->mutex);
> -     }
> +     mutex_lock(&ctx->mutex);
> +     smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
> +     cpuctx->online = 0;
> +     mutex_unlock(&ctx->mutex);
>       cpumask_clear_cpu(cpu, perf_online_mask);
>       mutex_unlock(&pmus_lock);
> }
> @@ -11637,20 +11690,17 @@ int perf_event_init_cpu(unsigned int cpu
> {
>       struct perf_cpu_context *cpuctx;
>       struct perf_event_context *ctx;
> -     struct pmu *pmu;
> 
>       perf_swevent_init_cpu(cpu);
> 
>       mutex_lock(&pmus_lock);
>       cpumask_set_cpu(cpu, perf_online_mask);
> -     list_for_each_entry(pmu, &pmus, entry) {
> -             cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> -             ctx = &cpuctx->ctx;
> +     cpuctx = per_cpu_ptr(&cpu_context, cpu);
> +     ctx = &cpuctx->ctx;
> 
> -             mutex_lock(&ctx->mutex);
> -             cpuctx->online = 1;
> -             mutex_unlock(&ctx->mutex);
> -     }
> +     mutex_lock(&ctx->mutex);
> +     cpuctx->online = 1;
> +     mutex_unlock(&ctx->mutex);
>       mutex_unlock(&pmus_lock);
> 
>       return 0;

Re: [RFC][PATCH] perf: Rewrite core context handling

Reply via email to