This kernel patch adds the ability to filter monitoring based on container groups (cgroups). It is for use in per-cpu mode only. The patch adds perf_event_attr.cgroup, a boolean, to activate this new mode. The cgroup to monitor is designated by perf_event_attr.cgroup_fd, an open file descriptor on the <mnt>/<cgroup>/perf_event.perf file.

This is the second version of the patch. It corrects the way time_enabled is accounted for: in cgroup mode, time_enabled reflects the time the cgroup was active, i.e., the time threads from the cgroup executed on the monitored CPU. This is a more useful metric than plain wall-clock time. The meaning of time_enabled without cgroup filtering is unaffected.
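For illustration, here is a minimal, untested userspace sketch of the intended usage (not part of the patch). It assumes kernel headers regenerated with the new perf_event_attr.cgroup/cgroup_fd fields from this patch, a raw syscall wrapper for perf_event_open(), and a hypothetical cgroup filesystem mounted at /dev/cgroup with a group named "mygroup". It counts cycles on CPU0, restricted to that cgroup:

#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* glibc has no wrapper for perf_event_open(), use a raw syscall */
static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t values[3];	/* count, time_enabled, time_running */
	int cgrp_fd, fd;

	/* an fd on <mnt>/<cgroup>/perf_event.perf designates the cgroup */
	cgrp_fd = open("/dev/cgroup/mygroup/perf_event.perf", O_RDONLY);
	if (cgrp_fd < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.cgroup    = 1;		/* new: activate cgroup mode */
	attr.cgroup_fd = cgrp_fd;	/* new: cgroup to monitor */
	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_TOTAL_TIME_RUNNING;

	/* cgroup mode is per-cpu only: pid must be -1, cpu >= 0 */
	fd = perf_event_open(&attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	sleep(10);

	if (read(fd, values, sizeof(values)) != sizeof(values))
		return 1;

	/*
	 * time_enabled/time_running count only while threads from the
	 * cgroup ran on CPU0, so the usual count * enabled / running
	 * scaling yields an estimate over cgroup time, not wall-clock.
	 */
	printf("count=%" PRIu64 " enabled=%" PRIu64 " running=%" PRIu64 "\n",
	       values[0], values[1], values[2]);

	close(fd);
	close(cgrp_fd);
	return 0;
}

Note that the pid argument must be -1: the new check in perf_event_open() rejects attr.cgroup for anything but per-cpu (system-wide) events, and with the corrected accounting the time_enabled/time_running values returned by read() are relative to cgroup time on that CPU, not wall-clock time.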
Signed-off-by: Stephane Eranian <eran...@google.com> -- diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3cb7d04..ed76357 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -618,6 +618,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, unsigned short css_id(struct cgroup_subsys_state *css); unsigned short css_depth(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ccefff0..93f86b7 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -65,4 +65,8 @@ SUBSYS(net_cls) SUBSYS(blkio) #endif +#ifdef CONFIG_PERF_EVENTS +SUBSYS(perf) +#endif + /* */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 000610c..f84b38e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -215,8 +215,9 @@ struct perf_event_attr { */ precise_ip : 2, /* skid constraint */ mmap_data : 1, /* non-exec mmap data */ + cgroup : 1, /* cgroup aggregation */ - __reserved_1 : 46; + __reserved_1 : 45; union { __u32 wakeup_events; /* wakeup every n events */ @@ -226,6 +227,8 @@ struct perf_event_attr { __u32 bp_type; __u64 bp_addr; __u64 bp_len; + + int cgroup_fd; }; /* @@ -461,6 +464,7 @@ enum perf_callchain_context { */ #ifdef CONFIG_PERF_EVENTS +# include <linux/cgroup.h> # include <asm/perf_event.h> # include <asm/local64.h> #endif @@ -657,6 +661,18 @@ struct swevent_hlist { #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 +#ifdef CONFIG_CGROUPS +struct perf_cgroup_time { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_time *time; +}; +#endif + /** * struct perf_event - performance event kernel representation: */ @@ -759,7 +775,9 @@ struct perf_event { struct ftrace_event_call *tp_event; struct event_filter *filter; #endif - +#ifdef CONFIG_CGROUPS + struct perf_cgroup *css; +#endif #endif /* CONFIG_PERF_EVENTS */ }; @@ -806,6 +824,8 @@ struct perf_event_context { u64 generation; int pin_count; struct rcu_head rcu_head; + + int nr_cgroups; }; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5c5497..3e56354 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4722,6 +4722,23 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id) +{ + struct cgroup *cgrp; + + /* check in cgroup filesystem */ + if (f->f_op != &cgroup_seqfile_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry->d_parent); + + return cgrp->subsys[id]; +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 4b84e63..2723d52 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -35,6 +35,7 @@ #include <asm/irq_regs.h> +#define PERF_TSTAMP_ENABLE_INVALID (~0) /* invalid marker, cannot be zero */ /* * Each CPU has a list of per CPU events: */ @@ -49,6 +50,228 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +static 
void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task, int css_sw); +static inline u64 perf_clock(void) +{ + return local_clock(); +} + +#ifdef CONFIG_CGROUPS + +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + if (!task) + return NULL; + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline +struct perf_cgroup *perf_cgroup_from_cont(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event, struct task_struct *task) +{ + struct perf_cgroup *css = perf_cgroup_from_task(task); + return !event->css || event->css == css; +} + +static void *perf_get_cgroup(int fd) +{ + struct cgroup_subsys_state *css; + struct file *file; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + css = cgroup_css_from_file(file, perf_subsys_id); + if (!IS_ERR(css)) + css_get(css); + + fput_light(file, fput_needed); + + return css; +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + if (event->css) + css_put(&event->css->css); +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->css != NULL; +} + +static inline int is_css_current(struct perf_event *event) +{ + struct perf_cgroup *css = perf_cgroup_from_task(current); + + return css == event->css; +} + +static inline u64 __perf_event_css_time(struct perf_event *event) +{ + struct perf_cgroup_time *t; + t = per_cpu_ptr(event->css->time, event->cpu); + return t->time; +} + +static inline void __update_css_time(struct perf_cgroup *css) +{ + u64 now; + struct perf_cgroup_time *t; + int cpu = smp_processor_id(); + + if (!css) + return; + + now = perf_clock(); + + t = per_cpu_ptr(css->time, cpu); + + t->time += now - t->timestamp; + t->timestamp = now; +} + +static inline void update_task_css_time(struct task_struct *task) +{ + struct perf_cgroup *css_out = perf_cgroup_from_task(task); + __update_css_time(css_out); +} + +static inline void update_event_css_time(struct perf_event *event) +{ + if (!is_css_current(event)) + return; + __update_css_time(event->css); +} + +static inline void perf_cgroup_switch(struct task_struct *task, + struct task_struct *next) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_cgroup *css_out = perf_cgroup_from_task(task); + struct perf_cgroup *css_in = perf_cgroup_from_task(next); + struct perf_cgroup_time *t; + int css_sw = 0; + + if (css_out != css_in) { + css_sw = 1; + update_task_css_time(task); + t = per_cpu_ptr(css_in->time, smp_processor_id()); + t->timestamp = perf_clock(); + } + + /* + * if cpu context has at least one event with cgroup constraint, + * then flush out all existing events and schedule them back in, + * taking into account the incoming cgroup.
This is a cgroup switch. + */ + if (cpuctx->ctx.nr_cgroups > 0 && css_sw) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, next, 1); + } +} + +static inline int perf_connect_cgroup(struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *css; + + css = perf_get_cgroup(attr->cgroup_fd); + if (IS_ERR(css)) + return PTR_ERR(css); + /* + * all events in a group must monitor + * the same cgroup because a thread belongs + * to only one cgroup at a time + */ + if (group_leader && group_leader->css != css) { + event->css = css; + perf_put_cgroup(event); + return -EINVAL; + } + + event->css = css; + + return 0; +} + +#else /* !CONFIG_CGROUP */ + +static inline bool +perf_cgroup_match(struct perf_event *event, struct task_struct *task) +{ + return true; +} + +static inline void *perf_get_cgroup(int fd) +{ + return ERR_PTR(-ENOTSUPP); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline int is_css_current(struct perf_event *event) +{ + return 0; +} + +static inline u64 __perf_event_css_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_css_time(void *css) +{} + +static inline void update_event_css_time(struct perf_event *event) +{} + +static inline void update_task_css_time(struct task_struct *t) +{} +static inline void perf_cgroup_switch(struct task_struct *task, + struct task_struct *next) +{} + +static inline int perf_connect_cgroup(struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +#endif + + + /* * perf event paranoia level: * -1 - not paranoid at all @@ -212,11 +435,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) put_ctx(ctx); } -static inline u64 perf_clock(void) -{ - return local_clock(); -} - /* * Update the record of the current time in a context. */ @@ -228,29 +446,46 @@ static void update_context_time(struct perf_event_context *ctx) ctx->timestamp = now; } +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) { + if (event->cpu == -1) { + WARN_ON(event->cpu != smp_processor_id()); + return 0; + } + return __perf_event_css_time(event); + } + + return ctx ? ctx->time : 0; +} + /* * Update the total_time_enabled and total_time_running fields for a event.
*/ static void update_event_times(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - u64 run_end; + u64 run_end, run_start; if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; + run_end = perf_event_time(event); + run_start = event->tstamp_enabled; - event->total_time_enabled = run_end - event->tstamp_enabled; + /* + * that means the cgroup never got scheduled in + * so ensure total_time_enabled is zero + */ + if (run_start == PERF_TSTAMP_ENABLE_INVALID) + run_start = run_end; + + event->total_time_enabled = run_end - run_start; if (event->state == PERF_EVENT_STATE_INACTIVE) run_end = event->tstamp_stopped; - else - run_end = ctx->time; event->total_time_running = run_end - event->tstamp_running; } @@ -301,6 +536,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } + if (is_cgroup_event(event)) + ctx->nr_cgroups++; + list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) @@ -340,6 +578,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; + if (is_cgroup_event(event)) + ctx->nr_cgroups--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -403,9 +644,10 @@ static void perf_group_detach(struct perf_event *event) } static inline int -event_filter_match(struct perf_event *event) +event_filter_match(struct perf_event *event, struct task_struct *task) { - return event->cpu == -1 || event->cpu == smp_processor_id(); + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event, task); } static void @@ -413,6 +655,7 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); u64 delta; /* * An event which could not be activated because of @@ -421,10 +664,10 @@ event_sched_out(struct perf_event *event, * via read() for time_enabled, time_running: */ if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { - delta = ctx->time - event->tstamp_stopped; + && !event_filter_match(event, current)) { + delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; } if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -435,7 +678,7 @@ event_sched_out(struct perf_event *event, event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; event->pmu->disable(event); event->oncpu = -1; @@ -589,6 +832,11 @@ static void __perf_event_disable(void *info) * If it is in error state, leave it in error state. */ if (event->state >= PERF_EVENT_STATE_INACTIVE) { + /* + * update css time only if current->css corresponds + * to event. 
This is used to update tstamp->stopped + */ + update_event_css_time(event); update_context_time(ctx); update_group_times(event); if (event == event->group_leader) @@ -673,7 +921,8 @@ event_sched_in(struct perf_event *event, return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; + event->tstamp_running += + perf_event_time(event) - event->tstamp_stopped; if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -775,11 +1024,33 @@ static int group_can_go_on(struct perf_event *event, static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); + list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = ctx->time; - event->tstamp_running = ctx->time; - event->tstamp_stopped = ctx->time; + + event->tstamp_running = tstamp; + event->tstamp_stopped = tstamp; + event->tstamp_enabled = tstamp; + + /* + * an event is added to a context even if the css constraint + * is not satisfied. In per-cgroup mode, time_enabled only + * counts when threads from the css are active on the CPU. + * + * tstamp_enabled denotes the first time the event CAN be + * enabled, i.e., the first time threads from the css are + * scheduled in. Note that the event may not be scheduled + * immediately if the PMU is overcommitted yet the timestamp + * points to the first css activation. + * + * If css is not currently active, then we mark + * tstamp_enabled = ~0 to remember that it needs to be + * corrected in ctx_flexible_sched_in() and + * ctx_pinned_sched_in() + */ + if (is_cgroup_event(event) && !is_css_current(event)) + event->tstamp_enabled = PERF_TSTAMP_ENABLE_INVALID; } /* @@ -818,9 +1089,16 @@ static void __perf_install_in_context(void *info) */ perf_disable(); + /* + * in cgroup mode, we know the event matches + * the current cgroup, so update the cgroup's + * time so we timestamp correctly. + */ + update_event_css_time(event); + add_event_to_ctx(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, current)) goto unlock; /* @@ -928,13 +1206,14 @@ static void __perf_event_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *sub; + u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = ctx->time - event->total_time_enabled; + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; + sub->tstamp_enabled = tstamp - sub->total_time_enabled; } /* @@ -964,9 +1243,17 @@ static void __perf_event_enable(void *info) if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; + + /* + * in cgroup mode, we know the event matches + * the current cgroup, so update the cgroup's + * time so we timestamp correctly. 
+ */ + update_event_css_time(event); + __perf_event_mark_enabled(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, current)) goto unlock; /* @@ -1079,12 +1366,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) @@ -1096,6 +1377,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); + update_task_css_time(current); perf_disable(); if (!ctx->nr_active) @@ -1209,71 +1491,6 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. - */ -void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; - struct perf_event_context *next_ctx; - struct perf_event_context *parent; - int do_switch = 1; - - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - - if (likely(!ctx || !cpuctx->task_ctx)) - return; - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. 
- */ - raw_spin_lock(&ctx->lock); - raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp = next_ctx; - next->perf_event_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_event_sync_stat(ctx, next_ctx); - } - raw_spin_unlock(&next_ctx->lock); - raw_spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; - } -} - static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { @@ -1308,16 +1525,40 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, static void ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) + struct perf_cpu_context *cpuctx, + struct task_struct *task, int css_sw) { struct perf_event *event; list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + u64 tstamp = perf_event_time(event); + if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, task)) continue; + if (is_cgroup_event(event)) { + /* + * if css was not active when the event was + * added to ctx, then this is the first time + * the event can be effectively scheduled, thus + * we update tstamp_enabled + */ + if (event->tstamp_enabled == PERF_TSTAMP_ENABLE_INVALID) + event->tstamp_enabled = tstamp; + /* + * if we come here because of a context switch + * with a cgroup switch, then we need to update + * the point in time at which all cgroup events + * have been stopped. Otherwise, we would compute + * bogus tstamp_running deltas, which would include + * time the cgroup is not active. + */ + if (css_sw) + event->tstamp_stopped = tstamp; + } + if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -1334,7 +1575,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, static void ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) + struct perf_cpu_context *cpuctx, + struct task_struct *task, int css_sw) { struct perf_event *event; int can_add_hw = 1; @@ -1347,9 +1589,31 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, task)) continue; + if (is_cgroup_event(event)) { + u64 tstamp = perf_event_time(event); + /* + * if css was not active when the event was + * added to ctx, then this is the first time + * the event can be effectively scheduled, thus + * we update tstamp_enabled + */ + if (event->tstamp_enabled == PERF_TSTAMP_ENABLE_INVALID) + event->tstamp_enabled = tstamp; + /* + * if we come here because of a context switch + * with a cgroup switch, then we need to update + * the point in time at which all cgroup events + * have been stopped. Otherwise, we would compute + * bogus tstamp_running deltas, which would include + * time the cgroup is not active.
+ */ + if (css_sw) + event->tstamp_stopped = tstamp; + } + if (group_can_go_on(event, cpuctx, can_add_hw)) if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -1359,7 +1623,8 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task, int css_sw) { raw_spin_lock(&ctx->lock); ctx->is_active = 1; @@ -1375,11 +1640,11 @@ ctx_sched_in(struct perf_event_context *ctx, * in order to give them the best chance of going on. */ if (event_type & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + ctx_pinned_sched_in(ctx, cpuctx, task, css_sw); /* Then walk through the lower prio flexible groups */ if (event_type & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + ctx_flexible_sched_in(ctx, cpuctx, task, css_sw); perf_enable(); out: @@ -1387,11 +1652,12 @@ ctx_sched_in(struct perf_event_context *ctx, } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task, int css_sw) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task, css_sw); } static void task_ctx_sched_in(struct task_struct *task, @@ -1404,7 +1670,7 @@ static void task_ctx_sched_in(struct task_struct *task, return; if (cpuctx->task_ctx == ctx) return; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task, 0); cpuctx->task_ctx = ctx; } /* @@ -1438,15 +1704,88 @@ void perf_event_task_sched_in(struct task_struct *task) */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, 0); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, 0); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, 0); cpuctx->task_ctx = ctx; perf_enable(); } +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + int do_switch = 1; + + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + + /* + * switching cgroups + * must update time in going out cgroup + * mark new start time in coming in cgroup + */ + perf_cgroup_switch(task, next); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). 
It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + raw_spin_lock(&ctx->lock); + raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + raw_spin_unlock(&next_ctx->lock); + raw_spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + ctx_sched_out(ctx, cpuctx, EVENT_ALL); + cpuctx->task_ctx = NULL; + } +} + + #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -1579,7 +1918,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (event->state != PERF_EVENT_STATE_ACTIVE) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, current)) continue; hwc = &event->hw; @@ -1660,7 +1999,7 @@ void perf_event_task_tick(struct task_struct *curr) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, curr, 0); if (ctx) task_ctx_sched_in(curr, EVENT_FLEXIBLE); perf_enable(); @@ -1747,6 +2086,7 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); + update_event_css_time(event); update_context_time(ctx); update_event_times(event); raw_spin_unlock(&ctx->lock); @@ -1773,6 +2113,7 @@ static u64 perf_event_read(struct perf_event *event) unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); + update_event_css_time(event); update_context_time(ctx); update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -2132,6 +2473,9 @@ static void free_event(struct perf_event *event) event->buffer = NULL; } + if (is_cgroup_event(event)) + perf_put_cgroup(event); + if (event->destroy) event->destroy(event); @@ -3764,7 +4108,7 @@ static int perf_event_task_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, current)) return 0; if (event->attr.comm || event->attr.mmap || @@ -3878,7 +4222,7 @@ static int perf_event_comm_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, current)) return 0; if (event->attr.comm) @@ -3999,7 +4343,7 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event, current)) return 0; if ((!executable && event->attr.mmap_data) || @@ -4660,6 +5004,7 @@ static void task_clock_perf_event_read(struct perf_event *event) u64 time; if (!in_nmi()) { + update_event_css_time(event); update_context_time(event->ctx); time = event->ctx->time; } else { @@ -5037,6 +5382,14 @@ perf_event_alloc(struct perf_event_attr *attr, if (!event) return ERR_PTR(-ENOMEM); + if (attr->cgroup) { + err = perf_connect_cgroup(event, attr, group_leader); + if (err) { + kfree(event); + return ERR_PTR(err); + } + } + /* * Single events are their own group leaders, with an * empty sibling list: @@ -5125,6 +5478,7 @@ done: if (err) { if (event->ns) put_pid_ns(event->ns); + perf_put_cgroup(event); 
kfree(event); return ERR_PTR(err); } @@ -5320,6 +5674,10 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* cgroup reserved for system-wide */ + if (attr.cgroup && pid != -1) + return -EINVAL; + event_fd = get_unused_fd_flags(O_RDWR); if (event_fd < 0) return event_fd; @@ -6094,3 +6452,69 @@ static int __init perf_event_sysfs_init(void) &perfclass_attr_group); } device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUPS +static int perf_cgroup_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + return 0; +} + +static struct cftype perf_cgroup_files[] = { + { .name = "perf", + .read_map = perf_cgroup_read_map, + }, +}; + +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + struct perf_cgroup_time *t; + int c; + + jc = vmalloc(sizeof(*jc)); + if (!jc) + return ERR_PTR(-ENOMEM); + + memset(jc, 0, sizeof(*jc)); + + jc->time = alloc_percpu(struct perf_cgroup_time); + if (!jc->time) { + vfree(jc); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(c) { + t = per_cpu_ptr(jc->time, c); + t->time = 0; + t->timestamp = 0; + } + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc = perf_cgroup_from_cont(cont); + + free_percpu(jc->time); + vfree(jc); +} + +static int perf_cgroup_populate(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, perf_cgroup_files, + ARRAY_SIZE(perf_cgroup_files)); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .populate = perf_cgroup_populate, + .early_init = 0, +}; +#endif /* CONFIG_CGROUP */