Re: [PATCH] perf: update perf_cgroup time for ancestor cgroup(s)
> On Mar 12, 2018, at 5:38 AM, Peter Zijlstra wrote: > > On Sun, Mar 11, 2018 at 10:32:44PM -0700, Song Liu wrote: >> When a perf_event is attached to parent cgroup, it should count events >> for all children cgroups: >> >> parent_group < perf_event >> \ >>- child_group < process(es) >> >> However, in our tests, we found this perf_event cannot report reliable >> results. This is because perf_event->cgrp and cpuctx->cgrp are not >> identical, thus perf_event->cgrp are not updated properly. > > It might help now and in for our older selves, if you could provide a > simple reproducer for this. I will add a repro to v2. > >> Signed-off-by: Song Liu >> Reported-by: Ephraim Park >> --- >> kernel/events/core.c | 68 >> +++- >> 1 file changed, 67 insertions(+), 1 deletion(-) >> >> diff --git a/kernel/events/core.c b/kernel/events/core.c >> index 5789810..623d38f 100644 >> --- a/kernel/events/core.c >> +++ b/kernel/events/core.c > >> @@ -766,6 +821,17 @@ perf_cgroup_set_timestamp(struct task_struct *task, >> cgrp = perf_cgroup_from_task(task, ctx); >> info = this_cpu_ptr(cgrp->info); >> info->timestamp = ctx->timestamp; >> + >> +/* set timestamp for ancestor cgroups */ >> +if (cgrp->css.cgroup->level > 1) { >> +struct perf_cgroup_info *info = this_cpu_ptr(cgrp->info); >> +struct perf_event *event; >> + >> +list_for_each_entry(event, &ctx->pinned_groups, group_entry) >> +perf_ancestor_cgroup_set_timestamp(event, cgrp, info); >> +list_for_each_entry(event, &ctx->flexible_groups, group_entry) >> +perf_ancestor_cgroup_set_timestamp(event, cgrp, info); >> +} >> } > > That doesn't make any kind of sense... if we're interested in ancestor > groups, then WTH are you iterating _events_ ? > > I'm expecting something like: > > struct cgroup_subsys_state *css; > > for (css = cgrp->css; css; css = css->parent) { > /* fudge time */ > } I guess this is what I have been looking for! Thanks Peter! V2 coming soon... Song
Re: [PATCH] perf: update perf_cgroup time for ancestor cgroup(s)
On Sun, Mar 11, 2018 at 10:32:44PM -0700, Song Liu wrote: > When a perf_event is attached to parent cgroup, it should count events > for all children cgroups: > > parent_group < perf_event >\ > - child_group < process(es) > > However, in our tests, we found this perf_event cannot report reliable > results. This is because perf_event->cgrp and cpuctx->cgrp are not > identical, thus perf_event->cgrp are not updated properly. It might help now and in for our older selves, if you could provide a simple reproducer for this. > Signed-off-by: Song Liu > Reported-by: Ephraim Park > --- > kernel/events/core.c | 68 > +++- > 1 file changed, 67 insertions(+), 1 deletion(-) > > diff --git a/kernel/events/core.c b/kernel/events/core.c > index 5789810..623d38f 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -766,6 +821,17 @@ perf_cgroup_set_timestamp(struct task_struct *task, > cgrp = perf_cgroup_from_task(task, ctx); > info = this_cpu_ptr(cgrp->info); > info->timestamp = ctx->timestamp; > + > + /* set timestamp for ancestor cgroups */ > + if (cgrp->css.cgroup->level > 1) { > + struct perf_cgroup_info *info = this_cpu_ptr(cgrp->info); > + struct perf_event *event; > + > + list_for_each_entry(event, &ctx->pinned_groups, group_entry) > + perf_ancestor_cgroup_set_timestamp(event, cgrp, info); > + list_for_each_entry(event, &ctx->flexible_groups, group_entry) > + perf_ancestor_cgroup_set_timestamp(event, cgrp, info); > + } > } That doesn't make any kind of sense... if we're interested in ancestor groups, then WTH are you iterating _events_ ? I'm expecting something like: struct cgroup_subsys_state *css; for (css = cgrp->css; css; css = css->parent) { /* fudge time */ }
[PATCH] perf: update perf_cgroup time for ancestor cgroup(s)
When a perf_event is attached to parent cgroup, it should count events for all children cgroups: parent_group < perf_event \ - child_group < process(es) However, in our tests, we found this perf_event cannot report reliable results. This is because perf_event->cgrp and cpuctx->cgrp are not identical, thus perf_event->cgrp are not updated properly. This patch fixes this by updating perf_cgroup properly for ancestor cgroup(s). Signed-off-by: Song Liu Reported-by: Ephraim Park --- kernel/events/core.c | 68 +++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5789810..623d38f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -722,11 +722,48 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp) info->timestamp = now; } +static inline void +update_ancestor_cgroup_time(struct perf_event *event, + struct perf_cgroup *cgrp, + struct perf_cgroup_info *cgrp_info) +{ + if (!is_cgroup_event(event)) + return; + + if (cgroup_is_descendant(cgrp->css.cgroup, +event->cgrp->css.cgroup) && + event->cgrp != cgrp) { + struct perf_cgroup_info *info; + + info = this_cpu_ptr(event->cgrp->info); + info->time += cgrp_info->timestamp - info->timestamp; + info->timestamp = cgrp_info->timestamp; + } +} + static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) + + if (cgrp_out) { __update_cgrp_time(cgrp_out); + + /* update time for ancestor cgroups */ + if (cgrp_out->css.cgroup->level > 1) { + struct perf_cgroup_info *info + = this_cpu_ptr(cgrp_out->info); + struct perf_event *event; + + list_for_each_entry(event, &cpuctx->ctx.pinned_groups, + group_entry) + update_ancestor_cgroup_time(event, cgrp_out, + info); + list_for_each_entry(event, &cpuctx->ctx.flexible_groups, + group_entry) + update_ancestor_cgroup_time(event, cgrp_out, + info); + } + } } static inline void update_cgrp_time_from_event(struct perf_event *event) @@ -749,6 +786,24 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) } static inline void +perf_ancestor_cgroup_set_timestamp(struct perf_event *event, + struct perf_cgroup *cgrp, + struct perf_cgroup_info *cgrp_info) +{ + if (!is_cgroup_event(event)) + return; + + if (cgroup_is_descendant(cgrp->css.cgroup, +event->cgrp->css.cgroup) && + event->cgrp != cgrp) { + struct perf_cgroup_info *info + = this_cpu_ptr(event->cgrp->info); + + info->timestamp = cgrp_info->timestamp; + } +} + +static inline void perf_cgroup_set_timestamp(struct task_struct *task, struct perf_event_context *ctx) { @@ -766,6 +821,17 @@ perf_cgroup_set_timestamp(struct task_struct *task, cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; + + /* set timestamp for ancestor cgroups */ + if (cgrp->css.cgroup->level > 1) { + struct perf_cgroup_info *info = this_cpu_ptr(cgrp->info); + struct perf_event *event; + + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + perf_ancestor_cgroup_set_timestamp(event, cgrp, info); + list_for_each_entry(event, &ctx->flexible_groups, group_entry) + perf_ancestor_cgroup_set_timestamp(event, cgrp, info); + } } static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); -- 2.9.5