Re: [PATCH] perf: update perf_cgroup time for ancestor cgroup(s)

2018-03-12 Thread Song Liu


> On Mar 12, 2018, at 5:38 AM, Peter Zijlstra  wrote:
> 
> On Sun, Mar 11, 2018 at 10:32:44PM -0700, Song Liu wrote:
>> When a perf_event is attached to parent cgroup, it should count events
>> for all children cgroups:
>> 
>> parent_group   < perf_event
>>   \
>>- child_group  < process(es)
>> 
>> However, in our tests, we found this perf_event cannot report reliable
>> results. This is because perf_event->cgrp and cpuctx->cgrp are not
>> identical, thus perf_event->cgrp are not updated properly.
> 
> It might help now and in for our older selves, if you could provide a
> simple reproducer for this.

I will add a repro to v2. 

> 
>> Signed-off-by: Song Liu 
>> Reported-by: Ephraim Park 
>> ---
>> kernel/events/core.c | 68 
>> +++-
>> 1 file changed, 67 insertions(+), 1 deletion(-)
>> 
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 5789810..623d38f 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
> 
>> @@ -766,6 +821,17 @@ perf_cgroup_set_timestamp(struct task_struct *task,
>>  cgrp = perf_cgroup_from_task(task, ctx);
>>  info = this_cpu_ptr(cgrp->info);
>>  info->timestamp = ctx->timestamp;
>> +
>> +/* set timestamp for ancestor cgroups */
>> +if (cgrp->css.cgroup->level > 1) {
>> +struct perf_cgroup_info *info = this_cpu_ptr(cgrp->info);
>> +struct perf_event *event;
>> +
>> +list_for_each_entry(event, &ctx->pinned_groups, group_entry)
>> +perf_ancestor_cgroup_set_timestamp(event, cgrp, info);
>> +list_for_each_entry(event, &ctx->flexible_groups, group_entry)
>> +perf_ancestor_cgroup_set_timestamp(event, cgrp, info);
>> +}
>> }
> 
> That doesn't make any kind of sense... if we're interested in ancestor
> groups, then WTH are you iterating _events_ ?
> 
> I'm expecting something like:
> 
>   struct cgroup_subsys_state *css;
> 
>   for (css = cgrp->css; css; css = css->parent) {
>   /* fudge time */
>   }

I guess this is what I have been looking for! Thanks Peter! 

V2 coming soon...

Song



Re: [PATCH] perf: update perf_cgroup time for ancestor cgroup(s)

2018-03-12 Thread Peter Zijlstra
On Sun, Mar 11, 2018 at 10:32:44PM -0700, Song Liu wrote:
> When a perf_event is attached to parent cgroup, it should count events
> for all children cgroups:
> 
>  parent_group   < perf_event
>\
> - child_group  < process(es)
> 
> However, in our tests, we found this perf_event cannot report reliable
> results. This is because perf_event->cgrp and cpuctx->cgrp are not
> identical, thus perf_event->cgrp are not updated properly.

It might help now and in for our older selves, if you could provide a
simple reproducer for this.

> Signed-off-by: Song Liu 
> Reported-by: Ephraim Park 
> ---
>  kernel/events/core.c | 68 
> +++-
>  1 file changed, 67 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 5789810..623d38f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c

> @@ -766,6 +821,17 @@ perf_cgroup_set_timestamp(struct task_struct *task,
>   cgrp = perf_cgroup_from_task(task, ctx);
>   info = this_cpu_ptr(cgrp->info);
>   info->timestamp = ctx->timestamp;
> +
> + /* set timestamp for ancestor cgroups */
> + if (cgrp->css.cgroup->level > 1) {
> + struct perf_cgroup_info *info = this_cpu_ptr(cgrp->info);
> + struct perf_event *event;
> +
> + list_for_each_entry(event, &ctx->pinned_groups, group_entry)
> + perf_ancestor_cgroup_set_timestamp(event, cgrp, info);
> + list_for_each_entry(event, &ctx->flexible_groups, group_entry)
> + perf_ancestor_cgroup_set_timestamp(event, cgrp, info);
> + }
>  }

That doesn't make any kind of sense... if we're interested in ancestor
groups, then WTH are you iterating _events_ ?

I'm expecting something like:

struct cgroup_subsys_state *css;

for (css = cgrp->css; css; css = css->parent) {
/* fudge time */
}





[PATCH] perf: update perf_cgroup time for ancestor cgroup(s)

2018-03-11 Thread Song Liu
When a perf_event is attached to parent cgroup, it should count events
for all children cgroups:

 parent_group   < perf_event
   \
- child_group  < process(es)

However, in our tests, we found this perf_event cannot report reliable
results. This is because perf_event->cgrp and cpuctx->cgrp are not
identical, thus perf_event->cgrp are not updated properly.

This patch fixes this by updating perf_cgroup properly for ancestor
cgroup(s).

Signed-off-by: Song Liu 
Reported-by: Ephraim Park 
---
 kernel/events/core.c | 68 +++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5789810..623d38f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -722,11 +722,48 @@ static inline void __update_cgrp_time(struct perf_cgroup 
*cgrp)
info->timestamp = now;
 }
 
+static inline void
+update_ancestor_cgroup_time(struct perf_event *event,
+   struct perf_cgroup *cgrp,
+   struct perf_cgroup_info *cgrp_info)
+{
+   if (!is_cgroup_event(event))
+   return;
+
+   if (cgroup_is_descendant(cgrp->css.cgroup,
+event->cgrp->css.cgroup) &&
+   event->cgrp != cgrp) {
+   struct perf_cgroup_info *info;
+
+   info = this_cpu_ptr(event->cgrp->info);
+   info->time += cgrp_info->timestamp - info->timestamp;
+   info->timestamp = cgrp_info->timestamp;
+   }
+}
+
 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context 
*cpuctx)
 {
struct perf_cgroup *cgrp_out = cpuctx->cgrp;
-   if (cgrp_out)
+
+   if (cgrp_out) {
__update_cgrp_time(cgrp_out);
+
+   /* update time for ancestor cgroups */
+   if (cgrp_out->css.cgroup->level > 1) {
+   struct perf_cgroup_info *info
+   = this_cpu_ptr(cgrp_out->info);
+   struct perf_event *event;
+
+   list_for_each_entry(event, &cpuctx->ctx.pinned_groups,
+   group_entry)
+   update_ancestor_cgroup_time(event, cgrp_out,
+   info);
+   list_for_each_entry(event, &cpuctx->ctx.flexible_groups,
+   group_entry)
+   update_ancestor_cgroup_time(event, cgrp_out,
+   info);
+   }
+   }
 }
 
 static inline void update_cgrp_time_from_event(struct perf_event *event)
@@ -749,6 +786,24 @@ static inline void update_cgrp_time_from_event(struct 
perf_event *event)
 }
 
 static inline void
+perf_ancestor_cgroup_set_timestamp(struct perf_event *event,
+  struct perf_cgroup *cgrp,
+  struct perf_cgroup_info *cgrp_info)
+{
+   if (!is_cgroup_event(event))
+   return;
+
+   if (cgroup_is_descendant(cgrp->css.cgroup,
+event->cgrp->css.cgroup) &&
+   event->cgrp != cgrp) {
+   struct perf_cgroup_info *info
+   = this_cpu_ptr(event->cgrp->info);
+
+   info->timestamp = cgrp_info->timestamp;
+   }
+}
+
+static inline void
 perf_cgroup_set_timestamp(struct task_struct *task,
  struct perf_event_context *ctx)
 {
@@ -766,6 +821,17 @@ perf_cgroup_set_timestamp(struct task_struct *task,
cgrp = perf_cgroup_from_task(task, ctx);
info = this_cpu_ptr(cgrp->info);
info->timestamp = ctx->timestamp;
+
+   /* set timestamp for ancestor cgroups */
+   if (cgrp->css.cgroup->level > 1) {
+   struct perf_cgroup_info *info = this_cpu_ptr(cgrp->info);
+   struct perf_event *event;
+
+   list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+   perf_ancestor_cgroup_set_timestamp(event, cgrp, info);
+   list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+   perf_ancestor_cgroup_set_timestamp(event, cgrp, info);
+   }
 }
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-- 
2.9.5