Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-12 Thread Vincent Guittot
On 11 April 2013 16:56, Frederic Weisbecker fweis...@gmail.com wrote:
> On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote:
>> On my smp platform which is made of 5 cores in 2 clusters, I have the
>> nr_busy_cpu field of sched_group_power struct that is not null when the
>> platform is fully idle. The root cause is:
>> During the boot sequence, some CPUs reach the idle loop and set their
>> NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
>> field is initialized later with the assumption that all CPUs are in the busy
>> state whereas some CPUs have already set their NOHZ_IDLE flag.
>>
>> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
>> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
>>
>> This condition can be ensured by adding a synchronize_rcu between the
>> destruction of old sched_domains and the creation of new ones so the
>> NOHZ_IDLE flag will not be updated with an old sched_domain once it has
>> been initialized. But this solution introduces an additional latency in
>> the rebuild sequence that is called during cpu hotplug.
>>
>> As suggested by Frederic Weisbecker, another solution is to have the same
>> rcu lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
>> introduced a new sched_domain_rq struct that is the entry point for both
>> sched_domains and objects that must follow the same lifecycle, like the
>> NOHZ_IDLE flag. They will share the same RCU lifecycle and will always be
>> synchronized.
>>
>> The synchronization is done at the cost of:
>>  - an additional indirection for accessing the first sched_domain level
>>  - an additional indirection and an rcu_dereference before accessing the
>>    NOHZ_IDLE flag.
>>
>> Change since v4:
>>  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
>>their states are always synchronized.
>>
>> Change since V3:
>>  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
>>  - Remove patch 2/2 which becomes useless with latest modifications
>>
>> Change since V2:
>>  - change the initialization to idle state instead of busy state so a CPU
>>    that enters idle during the build of the sched_domain will not corrupt
>>    the initialization state
>>
>> Change since V1:
>>  - remove the patch for SCHED softirq on an idle core use case as it was
>>a side effect of the other use cases.
>>
>> Signed-off-by: Vincent Guittot vincent.guit...@linaro.org
>
> Ok, the lockless scheme involving nr_busy_cpus and rq flags seems to be
> correct now.
> We can hope somebody will come up with a less complicated solution. But for
> now that's the best fix I've seen.

Thanks

>
> I just have a few comments on details.
>
>> ---
>>  include/linux/sched.h |6 +++
>>  kernel/sched/core.c   |  105 
>> -
>>  kernel/sched/fair.c   |   35 +++--
>>  kernel/sched/sched.h  |   24 +--
>>  4 files changed, 145 insertions(+), 25 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index d35d2b6..2a52188 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -959,6 +959,12 @@ struct sched_domain {
>>   unsigned long span[0];
>>  };
>>
>> +struct sched_domain_rq {
>> + struct sched_domain *sd;
>> + unsigned long flags;
>> + struct rcu_head rcu;/* used during destruction */
>> +};
>
> So the reason for this level of indirection won't be intuitive for those
> who read that code. Please add some comments that explain why we need
> that. ie: because we need the object lifecycle of sched_power and flags to be
> the same for the lockless scheme to work.

ok, i will add a comment

>
>> +
>>  static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
>>  {
>>   return to_cpumask(sd->span);
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 7f12624..69e2313 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain 
>> *sd, int cpu)
>>   destroy_sched_domain(sd, cpu);
>>  }
>>
>> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
>> +{
>> + if (!sd_rq)
>> + return;
>> +
>> + destroy_sched_domains(sd_rq->sd, cpu);
>> + kfree_rcu(sd_rq, rcu);
>> +}
>> +
>>  /*
>>   * Keep a special pointer to the highest sched_domain that has
>>   * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
>> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
>>   * hold the hotplug lock.
>>   */
>>  static void
>> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
>> + int cpu)
>>  {
>>   struct rq *rq = cpu_rq(cpu);
>> - struct sched_domain *tmp;
>> + struct sched_domain_rq *tmp_rq;
>
> old_sd_rq would be a clearer name.

ok

>
>> + struct sched_domain *tmp, *sd = NULL;
>> +
>> + /*
>> +  * If we don't have any sched_domain and associated object, we can
>> +  * directly jump to the attach sequence otherwise we try to degenerate
>> +  * the sched_domain
>> +  */

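The pattern at the heart of this fix, pulled together from the hunks quoted
above: one RCU-managed object carries both the first sched_domain level and
the flags, so a reader that dereferences the pointer once sees a consistent
pair, and a rebuild retires the pair as a unit. A minimal sketch, assuming
the per-rq pointer is named rq->sd_rq (the quoted hunks do not show the
field's final name) and using illustrative helper names, not the literal
patch:

    struct sched_domain_rq {
            struct sched_domain *sd;  /* first (lowest) domain level */
            unsigned long flags;      /* e.g. NOHZ_IDLE; lives and dies with sd */
            struct rcu_head rcu;      /* used during destruction */
    };

    /* writer side: publish a new domain+flags pair, retire the old one */
    static void attach_domain_rq(struct rq *rq, struct sched_domain_rq *new_sd_rq)
    {
            struct sched_domain_rq *old_sd_rq = rq->sd_rq;

            rcu_assign_pointer(rq->sd_rq, new_sd_rq); /* readers see the new pair */
            if (old_sd_rq)
                    kfree_rcu(old_sd_rq, rcu);        /* freed after all readers */
    }

    /* reader side: the flag is read through the same object as the domain */
    static int cpu_nohz_idle(struct rq *rq)
    {
            struct sched_domain_rq *sd_rq;
            int idle = 0;

            rcu_read_lock();
            sd_rq = rcu_dereference(rq->sd_rq);
            if (sd_rq)
                    idle = test_bit(NOHZ_IDLE, &sd_rq->flags);
            rcu_read_unlock();
            return idle;
    }

A late writer still holding a stale sched_domain_rq can then only dirty an
object that is already queued for freeing, never the freshly initialized
flags of the new hierarchy, which is the boot-time race described in the
changelog.
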
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-11 Thread Frederic Weisbecker
On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote:
> On my smp platform which is made of 5 cores in 2 clusters, I have the
> nr_busy_cpu field of sched_group_power struct that is not null when the
> platform is fully idle. The root cause is:
> During the boot sequence, some CPUs reach the idle loop and set their
> NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
> field is initialized later with the assumption that all CPUs are in the busy
> state whereas some CPUs have already set their NOHZ_IDLE flag.
> 
> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
> 
> This condition can be ensured by adding a synchronize_rcu between the
> destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE
> flag will not be updated with an old sched_domain once it has been initialized.
> But this solution introduces an additional latency in the rebuild sequence
> that is called during cpu hotplug.
> 
> As suggested by Frederic Weisbecker, another solution is to have the same
> rcu lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
> introduced a new sched_domain_rq struct that is the entry point for both
> sched_domains and objects that must follow the same lifecycle, like the
> NOHZ_IDLE flag. They will share the same RCU lifecycle and will always be
> synchronized.
> 
> The synchronization is done at the cost of:
>  - an additional indirection for accessing the first sched_domain level
>  - an additional indirection and an rcu_dereference before accessing the
>    NOHZ_IDLE flag.
> 
> Change since v4:
>  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
>their states are always synchronized.
> 
> Change since V3:
>  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
>  - Remove patch 2/2 which becomes useless with latest modifications
> 
> Change since V2:
>  - change the initialization to idle state instead of busy state so a CPU that
>    enters idle during the build of the sched_domain will not corrupt the
>    initialization state
> 
> Change since V1:
>  - remove the patch for SCHED softirq on an idle core use case as it was
>a side effect of the other use cases.
> 
> Signed-off-by: Vincent Guittot vincent.guit...@linaro.org

Ok, the lockless scheme involving nr_busy_cpus and rq flags seems to be
correct now.
We can hope somebody will come up with a less complicated solution. But for
now that's the best fix I've seen.

I just have a few comments on details.

> ---
>  include/linux/sched.h |6 +++
>  kernel/sched/core.c   |  105 
> -
>  kernel/sched/fair.c   |   35 +++--
>  kernel/sched/sched.h  |   24 +--
>  4 files changed, 145 insertions(+), 25 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d35d2b6..2a52188 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -959,6 +959,12 @@ struct sched_domain {
>   unsigned long span[0];
>  };
> 
> +struct sched_domain_rq {
> + struct sched_domain *sd;
> + unsigned long flags;
> + struct rcu_head rcu;/* used during destruction */
> +};

So the reason for this level of indirection won't be intuitive for those
who read that code. Please add some comments that explain why we need
that. ie: because we need the object lifecycle of sched_power and flags to be
the same for the lockless scheme to work.

> +
>  static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
>  {
>   return to_cpumask(sd->span);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7f12624..69e2313 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain 
> *sd, int cpu)
>   destroy_sched_domain(sd, cpu);
>  }
>  
> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
> +{
> + if (!sd_rq)
> + return;
> +
> + destroy_sched_domains(sd_rq->sd, cpu);
> + kfree_rcu(sd_rq, rcu);
> +}
> +
>  /*
>   * Keep a special pointer to the highest sched_domain that has
>   * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
>   * hold the hotplug lock.
>   */
>  static void
> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
> + int cpu)
>  {
>   struct rq *rq = cpu_rq(cpu);
> - struct sched_domain *tmp;
> + struct sched_domain_rq *tmp_rq;

old_sd_rq would be a clearer name.

> + struct sched_domain *tmp, *sd = NULL;
> +
> + /*
> +  * If we don't have any sched_domain and associated object, we can
> +  * directly jump to the attach sequence otherwise we try to degenerate
> +  * the sched_domain
> +  */
> + if (!sd_rq)
> + goto attach;
> +
> + /* Get a pointer to the 1st sched_domain */

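As context for the nr_busy_cpus side of the lockless scheme: in the mainline
code of this era, the CPU entering tickless idle sets its flag at most once
and then walks its domains to decrement the per-group busy counters, all
under one RCU read side. A sketch of what that looks like with the flag
relocated into sched_domain_rq as proposed here -- a reconstruction for
illustration, not the patch itself (the real function takes no cpu argument
and uses smp_processor_id()):

    void set_cpu_sd_state_idle(int cpu)
    {
            struct sched_domain_rq *sd_rq;
            struct sched_domain *sd;

            rcu_read_lock();
            sd_rq = rcu_dereference(cpu_rq(cpu)->sd_rq);
            if (!sd_rq || test_and_set_bit(NOHZ_IDLE, &sd_rq->flags))
                    goto unlock;  /* no domains, or already accounted as idle */

            /* flag and domain walk use the same RCU snapshot of the hierarchy */
            for (sd = sd_rq->sd; sd; sd = sd->parent)
                    atomic_dec(&sd->groups->sgp->nr_busy_cpus);
    unlock:
            rcu_read_unlock();
    }

Since flags and domains now share one object, a rebuild can never pair a new
hierarchy's nr_busy_cpus with an old flag word, which is what left the
counters non-zero on a fully idle machine.
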
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-10 Thread Vincent Guittot
On 9 April 2013 14:45, Frederic Weisbecker fweis...@gmail.com wrote:
> 2013/4/4 Vincent Guittot vincent.guit...@linaro.org:
>> On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote:
>>> Is it possible that we can be dealing here with a
>>> sched_group/sched_group_power that is used on another CPU (from that
>>> CPU's rq->rq_sd->sd) concurrently?
>>> When we call build_sched_groups(), we might reuse an existing struct
>>> sched_group used elsewhere right? If so, is there a race with the
>>> above initialization?
>>
>> No, we are not reusing an existing struct; the
>> sched_group/sched_group_power that is initialized here has just been
>> created by __visit_domain_allocation_hell in build_sched_domains. The
>> sched_group/sched_group_power is not already attached to any CPU
>
> I see. Yeah the group allocations/initialization is done per domain
> found in ndoms_new[]. And there is no further reuse of these groups
> once these are attached.
>
> Looking at the code it seems we can make some symmetric conclusion with
> group release? When we destroy a per cpu domain hierarchy and then put
> our references to the struct sched_group, all the other per cpu
> domains that reference these sched_group are about to put their
> reference soon too, right? Because IIUC we always destroy these per
> cpu domains per highest level partition (those found in doms_cur[])?

Yes

>
> I'm just asking to make sure we don't need some
> atomic_dec(nr_busy_cpus) on per cpu domain release, which is not
> necessary if the sched group is getting released soon.

yes, it's not needed

>
> Thanks for your patience :)

That's fine.


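The symmetry being confirmed in this exchange, condensed into a sketch
(modeled on the mainline free path of the time, minus the SD_OVERLAP special
case; illustrative only): each domain level holds a counted reference on its
group, and the last put frees the group together with its sched_group_power,
so nr_busy_cpus disappears with the partition and no compensating
atomic_dec() is needed on domain release.

    static void free_sched_domain(struct rcu_head *rcu)
    {
            struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

            if (atomic_dec_and_test(&sd->groups->ref)) {
                    kfree(sd->groups->sgp);  /* sched_group_power, incl. nr_busy_cpus */
                    kfree(sd->groups);
            }
            kfree(sd);
    }

    static void destroy_sched_domain(struct sched_domain *sd, int cpu)
    {
            call_rcu(&sd->rcu, free_sched_domain);  /* deferred past RCU readers */
    }
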
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-09 Thread Frederic Weisbecker
2013/4/4 Vincent Guittot vincent.guit...@linaro.org:
> On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote:
>> Is it possible that we can be dealing here with a
>> sched_group/sched_group_power that is used on another CPU (from that
>> CPU's rq->rq_sd->sd) concurrently?
>> When we call build_sched_groups(), we might reuse an existing struct
>> sched_group used elsewhere right? If so, is there a race with the
>> above initialization?
>
> No, we are not reusing an existing struct; the
> sched_group/sched_group_power that is initialized here has just been
> created by __visit_domain_allocation_hell in build_sched_domains. The
> sched_group/sched_group_power is not already attached to any CPU

I see. Yeah the group allocations/initialization is done per domain
found in ndoms_new[]. And there is no further reuse of these groups
once these are attached.

Looking at the code it seems we can make some symmetric conclusion with
group release? When we destroy a per cpu domain hierarchy and then put
our references to the struct sched_group, all the other per cpu
domains that reference these sched_group are about to put their
reference soon too, right? Because IIUC we always destroy these per
cpu domains per highest level partition (those found in doms_cur[])?

I'm just asking to make sure we don't need some
atomic_dec(nr_busy_cpus) on per cpu domain release, which is not
necessary if the sched group is getting released soon.

Thanks for your patience :)


Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-09 Thread Vincent Guittot
On 4 April 2013 19:30, Vincent Guittot vincent.guit...@linaro.org wrote:
> On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote:
>> 2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
>>> On my smp platform which is made of 5 cores in 2 clusters, I have the
>>> nr_busy_cpu field of sched_group_power struct that is not null when the
>>> platform is fully idle. The root cause is:
>>> During the boot sequence, some CPUs reach the idle loop and set their
>>> NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
>>> field is initialized later with the assumption that all CPUs are in the busy
>>> state whereas some CPUs have already set their NOHZ_IDLE flag.
>>>
>>> More generally, the NOHZ_IDLE flag must be initialized when new 
>>> sched_domains
>>> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
>>>
>>> This condition can be ensured by adding a synchronize_rcu between the
>>> destruction of old sched_domains and the creation of new ones so the
>>> NOHZ_IDLE flag will not be updated with an old sched_domain once it has
>>> been initialized. But this solution introduces an additional latency in
>>> the rebuild sequence
>>> that is called during cpu hotplug.
>>>
>>> As suggested by Frederic Weisbecker, another solution is to have the same
>>> rcu lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
>>> introduced a new sched_domain_rq struct that is the entry point for both
>>> sched_domains and objects that must follow the same lifecycle, like the
>>> NOHZ_IDLE flag. They will share the same RCU lifecycle and will always be
>>> synchronized.
>>>
>>> The synchronization is done at the cost of:
>>>  - an additional indirection for accessing the first sched_domain level
>>>  - an additional indirection and an rcu_dereference before accessing the
>>>    NOHZ_IDLE flag.
>>>
>>> Change since v4:
>>>  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
>>>their states are always synchronized.
>>>
>>> Change since V3:
>>>  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
>>>  - Remove patch 2/2 which becomes useless with latest modifications
>>>
>>> Change since V2:
>>>  - change the initialization to idle state instead of busy state so a CPU
>>>    that enters idle during the build of the sched_domain will not corrupt
>>>    the initialization state
>>>
>>> Change since V1:
>>>  - remove the patch for SCHED softirq on an idle core use case as it was
>>>a side effect of the other use cases.
>>>
>>> Signed-off-by: Vincent Guittot vincent.guit...@linaro.org
>>> ---
>>>  include/linux/sched.h |6 +++
>>>  kernel/sched/core.c   |  105 
>>> -
>>>  kernel/sched/fair.c   |   35 +++--
>>>  kernel/sched/sched.h  |   24 +--
>>>  4 files changed, 145 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index d35d2b6..2a52188 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -959,6 +959,12 @@ struct sched_domain {
>>> unsigned long span[0];
>>>  };
>>>
>>> +struct sched_domain_rq {
>>> +   struct sched_domain *sd;
>>> +   unsigned long flags;
>>> +   struct rcu_head rcu;/* used during destruction */
>>> +};
>>> +
>>>  static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
>>>  {
>>> return to_cpumask(sd->span);
>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>> index 7f12624..69e2313 100644
>>> --- a/kernel/sched/core.c
>>> +++ b/kernel/sched/core.c
>>> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct 
>>> sched_domain *sd, int cpu)
>>> destroy_sched_domain(sd, cpu);
>>>  }
>>>
>>> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
>>> +{
>>> +   if (!sd_rq)
>>> +   return;
>>> +
>>> +   destroy_sched_domains(sd_rq->sd, cpu);
>>> +   kfree_rcu(sd_rq, rcu);
>>> +}
>>> +
>>>  /*
>>>   * Keep a special pointer to the highest sched_domain that has
>>>   * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
>>> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
>>>   * hold the hotplug lock.
>>>   */
>>>  static void
>>> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
>>> +   int cpu)
>>>  {
>>> struct rq *rq = cpu_rq(cpu);
>>> -   struct sched_domain *tmp;
>>> +   struct sched_domain_rq *tmp_rq;
>>> +   struct sched_domain *tmp, *sd = NULL;
>>> +
>>> +   /*
>>> +* If we don't have any sched_domain and associated object, we can
>>> +* directly jump to the attach sequence otherwise we try to degenerate
>>> +* the sched_domain
>>> +*/
>>> +   if (!sd_rq)
>>> +   goto attach;
>>> +
>>> +   /* Get a pointer to the 1st sched_domain */
>>> +   sd = sd_rq->sd;
>>>
>>> /* Remove the sched domains which do not contribute to scheduling. */
>>> for (tmp = sd; tmp; ) {
>>> @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>> destroy_sched_domain(tmp, cpu);
>>> if (sd)
>>> sd->child = NULL;

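Pulling the truncated hunks of cpu_attach_domain() together, its new shape
under the patch -- a condensed sketch with the degeneration loop elided,
again assuming the per-rq field is named sd_rq (the hunks quoted in this
thread stop short of the attach tail):

    static void
    cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, int cpu)
    {
            struct rq *rq = cpu_rq(cpu);
            struct sched_domain_rq *old_sd_rq;
            struct sched_domain *sd = NULL;

            /* no domains and no flags object: jump straight to publishing */
            if (!sd_rq)
                    goto attach;

            /* the first sched_domain now sits behind one extra indirection */
            sd = sd_rq->sd;

            /*
             * ... degenerate useless domain levels exactly as before (elided),
             * keeping sd and sd_rq->sd in sync as levels are destroyed ...
             */

    attach:
            sched_domain_debug(sd, cpu);
            rq_attach_root(rq, rd);
            old_sd_rq = rq->sd_rq;
            rcu_assign_pointer(rq->sd_rq, sd_rq);    /* domains + flags switch together */
            destroy_sched_domain_rq(old_sd_rq, cpu); /* old pair retired via kfree_rcu */
    }
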
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Vincent Guittot
On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote:
> 2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
>> On my smp platform which is made of 5 cores in 2 clusters, I have the
>> nr_busy_cpu field of sched_group_power struct that is not null when the
>> platform is fully idle. The root cause is:
>> During the boot sequence, some CPUs reach the idle loop and set their
>> NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
>> field is initialized later with the assumption that all CPUs are in the busy
>> state whereas some CPUs have already set their NOHZ_IDLE flag.
>>
>> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
>> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
>>
>> This condition can be ensured by adding a synchronize_rcu between the
>> destruction of old sched_domains and the creation of new ones so the
>> NOHZ_IDLE flag will not be updated with an old sched_domain once it has
>> been initialized. But this solution introduces an additional latency in
>> the rebuild sequence
>> that is called during cpu hotplug.
>>
>> As suggested by Frederic Weisbecker, another solution is to have the same
>> rcu lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
>> introduced a new sched_domain_rq struct that is the entry point for both
>> sched_domains and objects that must follow the same lifecycle, like the
>> NOHZ_IDLE flag. They will share the same RCU lifecycle and will always be
>> synchronized.
>>
>> The synchronization is done at the cost of:
>>  - an additional indirection for accessing the first sched_domain level
>>  - an additional indirection and an rcu_dereference before accessing the
>>    NOHZ_IDLE flag.
>>
>> Change since v4:
>>  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
>>their states are always synchronized.
>>
>> Change since V3:
>>  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
>>  - Remove patch 2/2 which becomes useless with latest modifications
>>
>> Change since V2:
>>  - change the initialization to idle state instead of busy state so a CPU
>>    that enters idle during the build of the sched_domain will not corrupt
>>    the initialization state
>>
>> Change since V1:
>>  - remove the patch for SCHED softirq on an idle core use case as it was
>>a side effect of the other use cases.
>>
>> Signed-off-by: Vincent Guittot vincent.guit...@linaro.org
>> ---
>>  include/linux/sched.h |6 +++
>>  kernel/sched/core.c   |  105 
>> -
>>  kernel/sched/fair.c   |   35 +++--
>>  kernel/sched/sched.h  |   24 +--
>>  4 files changed, 145 insertions(+), 25 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index d35d2b6..2a52188 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -959,6 +959,12 @@ struct sched_domain {
>> unsigned long span[0];
>>  };
>>
>> +struct sched_domain_rq {
>> +   struct sched_domain *sd;
>> +   unsigned long flags;
>> +   struct rcu_head rcu;/* used during destruction */
>> +};
>> +
>>  static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
>>  {
>> return to_cpumask(sd->span);
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 7f12624..69e2313 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain 
>> *sd, int cpu)
>> destroy_sched_domain(sd, cpu);
>>  }
>>
>> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
>> +{
>> +   if (!sd_rq)
>> +   return;
>> +
>> +   destroy_sched_domains(sd_rq->sd, cpu);
>> +   kfree_rcu(sd_rq, rcu);
>> +}
>> +
>>  /*
>>   * Keep a special pointer to the highest sched_domain that has
>>   * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
>> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
>>   * hold the hotplug lock.
>>   */
>>  static void
>> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
>> +   int cpu)
>>  {
>> struct rq *rq = cpu_rq(cpu);
>> -   struct sched_domain *tmp;
>> +   struct sched_domain_rq *tmp_rq;
>> +   struct sched_domain *tmp, *sd = NULL;
>> +
>> +   /*
>> +* If we don't have any sched_domain and associated object, we can
>> +* directly jump to the attach sequence otherwise we try to degenerate
>> +* the sched_domain
>> +*/
>> +   if (!sd_rq)
>> +   goto attach;
>> +
>> +   /* Get a pointer to the 1st sched_domain */
>> +   sd = sd_rq->sd;
>>
>> /* Remove the sched domains which do not contribute to scheduling. */
>> for (tmp = sd; tmp; ) {
>> @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct 
>> root_domain *rd, int cpu)
>>   

Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Frederic Weisbecker
2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
> On my smp platform which is made of 5 cores in 2 clusters, I have the
> nr_busy_cpu field of sched_group_power struct that is not null when the
> platform is fully idle. The root cause is:
> During the boot sequence, some CPUs reach the idle loop and set their
> NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
> field is initialized later with the assumption that all CPUs are in the busy
> state whereas some CPUs have already set their NOHZ_IDLE flag.
>
> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
>
> This condition can be ensured by adding a synchronize_rcu between the
> destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE
> flag will not be updated with an old sched_domain once it has been initialized.
> But this solution introduces an additional latency in the rebuild sequence
> that is called during cpu hotplug.
>
> As suggested by Frederic Weisbecker, another solution is to have the same
> rcu lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
> introduced a new sched_domain_rq struct that is the entry point for both
> sched_domains and objects that must follow the same lifecycle, like the
> NOHZ_IDLE flag. They will share the same RCU lifecycle and will always be
> synchronized.
>
> The synchronization is done at the cost of:
>  - an additional indirection for accessing the first sched_domain level
>  - an additional indirection and an rcu_dereference before accessing the
>    NOHZ_IDLE flag.
>
> Change since v4:
>  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
>their states are always synchronized.
>
> Change since V3:
>  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
>  - Remove patch 2/2 which becomes useless with latest modifications
>
> Change since V2:
>  - change the initialization to idle state instead of busy state so a CPU that
>    enters idle during the build of the sched_domain will not corrupt the
>    initialization state
>
> Change since V1:
>  - remove the patch for SCHED softirq on an idle core use case as it was
>a side effect of the other use cases.
>
> Signed-off-by: Vincent Guittot vincent.guit...@linaro.org
> ---
>  include/linux/sched.h |6 +++
>  kernel/sched/core.c   |  105 
> -
>  kernel/sched/fair.c   |   35 +++--
>  kernel/sched/sched.h  |   24 +--
>  4 files changed, 145 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d35d2b6..2a52188 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -959,6 +959,12 @@ struct sched_domain {
> unsigned long span[0];
>  };
>
> +struct sched_domain_rq {
> +   struct sched_domain *sd;
> +   unsigned long flags;
> +   struct rcu_head rcu;/* used during destruction */
> +};
> +
>  static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
>  {
> return to_cpumask(sd->span);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7f12624..69e2313 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain 
> *sd, int cpu)
> destroy_sched_domain(sd, cpu);
>  }
>
> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
> +{
> +   if (!sd_rq)
> +   return;
> +
> +   destroy_sched_domains(sd_rq->sd, cpu);
> +   kfree_rcu(sd_rq, rcu);
> +}
> +
>  /*
>   * Keep a special pointer to the highest sched_domain that has
>   * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
>   * hold the hotplug lock.
>   */
>  static void
> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
> +   int cpu)
>  {
> struct rq *rq = cpu_rq(cpu);
> -   struct sched_domain *tmp;
> +   struct sched_domain_rq *tmp_rq;
> +   struct sched_domain *tmp, *sd = NULL;
> +
> +   /*
> +* If we don't have any sched_domain and associated object, we can
> +* directly jump to the attach sequence otherwise we try to degenerate
> +* the sched_domain
> +*/
> +   if (!sd_rq)
> +   goto attach;
> +
> +   /* Get a pointer to the 1st sched_domain */
> +   sd = sd_rq->sd;
>
> /* Remove the sched domains which do not contribute to scheduling. */
> for (tmp = sd; tmp; ) {
> @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct 
> root_domain *rd, int cpu)
> destroy_sched_domain(tmp, cpu);
> if (sd)
> sd->child = NULL;
> +   /* update sched_domain_rq */
> +   

Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Frederic Weisbecker
2013/4/4 Frederic Weisbecker fweis...@gmail.com:
> 2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
>> On my smp platform which is made of 5 cores in 2 clusters, I have the
>> nr_busy_cpu field of sched_group_power struct that is not null when the
>> platform is fully idle. The root cause is:
>> During the boot sequence, some CPUs reach the idle loop and set their
>> NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
>> field is initialized later with the assumption that all CPUs are in the busy
>> state whereas some CPUs have already set their NOHZ_IDLE flag.
>>
>> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
>> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
>>
>> This condition can be ensured by adding a synchronize_rcu between the
>> destruction of old sched_domains and the creation of new ones so the
>> NOHZ_IDLE flag will not be updated with an old sched_domain once it has
>> been initialized. But this solution introduces an additional latency in
>> the rebuild sequence
>> that is called during cpu hotplug.
>>
>> As suggested by Frederic Weisbecker, another solution is to have the same
>> rcu lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
>> introduced a new sched_domain_rq struct that is the entry point for both
>> sched_domains and objects that must follow the same lifecycle, like the
>> NOHZ_IDLE flag. They will share the same RCU lifecycle and will always be
>> synchronized.
>>
>> The synchronization is done at the cost of:
>>  - an additional indirection for accessing the first sched_domain level
>>  - an additional indirection and an rcu_dereference before accessing the
>>    NOHZ_IDLE flag.
>>
>> Change since v4:
>>  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
>>their states are always synchronized.
>>
>> Change since V3:
>>  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
>>  - Remove patch 2/2 which becomes useless with latest modifications
>>
>> Change since V2:
>>  - change the initialization to idle state instead of busy state so a CPU
>>    that enters idle during the build of the sched_domain will not corrupt
>>    the initialization state
>>
>> Change since V1:
>>  - remove the patch for SCHED softirq on an idle core use case as it was
>>a side effect of the other use cases.
>>
>> Signed-off-by: Vincent Guittot vincent.guit...@linaro.org
>> ---
>>  include/linux/sched.h |6 +++
>>  kernel/sched/core.c   |  105 
>> -
>>  kernel/sched/fair.c   |   35 +++--
>>  kernel/sched/sched.h  |   24 +--
>>  4 files changed, 145 insertions(+), 25 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index d35d2b6..2a52188 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -959,6 +959,12 @@ struct sched_domain {
>> unsigned long span[0];
>>  };
>>
>> +struct sched_domain_rq {
>> +   struct sched_domain *sd;
>
> Why not make it part of the structure content instead of pointing to it?

I just realized that would make destroy_sched_domains() too complicated
because only the leaf sched_domain belongs to a sched_domain_rq.
Nevermind.


Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Frederic Weisbecker
2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
> On my smp platform which is made of 5 cores in 2 clusters, I have the
> nr_busy_cpu field of sched_group_power struct that is not null when the
> platform is fully idle. The root cause is:
> During the boot sequence, some CPUs reach the idle loop and set their
> NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
> field is initialized later with the assumption that all CPUs are in the busy
> state whereas some CPUs have already set their NOHZ_IDLE flag.
>
> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
>
> This condition can be ensured by adding a synchronize_rcu between the
> destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE
> flag will not be updated with an old sched_domain once it has been initialized.
> But this solution introduces an additional latency in the rebuild sequence
> that is called during cpu hotplug.
>
> As suggested by Frederic Weisbecker, another solution is to have the same
> rcu lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
> introduced a new sched_domain_rq struct that is the entry point for both
> sched_domains and objects that must follow the same lifecycle, like the
> NOHZ_IDLE flag. They will share the same RCU lifecycle and will always be
> synchronized.
>
> The synchronization is done at the cost of:
>  - an additional indirection for accessing the first sched_domain level
>  - an additional indirection and an rcu_dereference before accessing the
>    NOHZ_IDLE flag.
>
> Change since v4:
>  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
>their states are always synchronized.
>
> Change since V3:
>  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
>  - Remove patch 2/2 which becomes useless with latest modifications
>
> Change since V2:
>  - change the initialization to idle state instead of busy state so a CPU that
>enters idle during the build of the sched_domain will not corrupt the
>initialization state
>
> Change since V1:
>  - remove the patch for SCHED softirq on an idle core use case as it was
>a side effect of the other use cases.
>
> Signed-off-by: Vincent Guittot vincent.guit...@linaro.org
> ---
>  include/linux/sched.h |6 +++
>  kernel/sched/core.c   |  105 
> -
>  kernel/sched/fair.c   |   35 +++--
>  kernel/sched/sched.h  |   24 +--
>  4 files changed, 145 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d35d2b6..2a52188 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -959,6 +959,12 @@ struct sched_domain {
> unsigned long span[0];
>  };
>
> +struct sched_domain_rq {
> +   struct sched_domain *sd;

Why not make it part of the structure content instead of pointing to it?

> +   unsigned long flags;
> +   struct rcu_head rcu;/* used during destruction */
> +};


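The trade-off behind this question and the "Nevermind" above, sketched out:
the domain levels form a parent chain whose destructor frees each level
independently after its own grace period, and only the lowest level hangs
off the sched_domain_rq. Embedding that leaf by value would force the
generic per-level path to special-case it. destroy_sched_domain_rq() below
is quoted from the patch; destroy_sched_domains() is modeled on the mainline
helper of the time, for illustration:

    /* each level of the chain is freed on its own, leaf to root */
    static void destroy_sched_domains(struct sched_domain *sd, int cpu)
    {
            for (; sd; sd = sd->parent)
                    destroy_sched_domain(sd, cpu);  /* per-level RCU deferral */
    }

    static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
    {
            if (!sd_rq)
                    return;

            destroy_sched_domains(sd_rq->sd, cpu);  /* tear down the whole chain */
            kfree_rcu(sd_rq, rcu);                  /* then the container + flags */
    }

With "struct sched_domain sd;" embedded by value instead of the pointer, the
leaf level could no longer go through the same per-level free as its parents.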
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Frederic Weisbecker
2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
 On my smp platform which is made of 5 cores in 2 clusters, I have the
 nr_busy_cpu field of sched_group_power struct that is not null when the
 platform is fully idle. The root cause is:
 During the boot sequence, some CPUs reach the idle loop and set their
 NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus
 field is initialized later with the assumption that all CPUs are in the busy
 state whereas some CPUs have already set their NOHZ_IDLE flag.

 More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
 are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.

 This condition can be ensured by adding a synchronize_rcu between the
 destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE
 flag will not be updated with old sched_domain once it has been initialized.
 But this solution introduces a additionnal latency in the rebuild sequence
 that is called during cpu hotplug.

 As suggested by Frederic Weisbecker, another solution is to have the same
 rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce
 a new sched_domain_rq struct that is the entry point for both sched_domains
 and objects that must follow the same lifecycle like NOHZ_IDLE flags. They
 will share the same RCU lifecycle and will be always synchronized.

 The synchronization is done at the cost of :
  - an additional indirection for accessing the first sched_domain level
  - an additional indirection and a rcu_dereference before accessing to the
NOHZ_IDLE flag.

 Change since v4:
  - link both sched_domain and NOHZ_IDLE flag in one RCU object so
their states are always synchronized.

 Change since V3;
  - NOHZ flag is not cleared if a NULL domain is attached to the CPU
  - Remove patch 2/2 which becomes useless with latest modifications

 Change since V2:
  - change the initialization to idle state instead of busy state so a CPU that
enters idle during the build of the sched_domain will not corrupt the
initialization state

 Change since V1:
  - remove the patch for SCHED softirq on an idle core use case as it was
a side effect of the other use cases.

 Signed-off-by: Vincent Guittot vincent.guit...@linaro.org
 ---
  include/linux/sched.h |6 +++
  kernel/sched/core.c   |  105 
 -
  kernel/sched/fair.c   |   35 +++--
  kernel/sched/sched.h  |   24 +--
  4 files changed, 145 insertions(+), 25 deletions(-)

 diff --git a/include/linux/sched.h b/include/linux/sched.h
 index d35d2b6..2a52188 100644
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
 @@ -959,6 +959,12 @@ struct sched_domain {
 unsigned long span[0];
  };

 +struct sched_domain_rq {
 +   struct sched_domain *sd;

Why not make it part of the structure content instead of pointing to it?

 +   unsigned long flags;
 +   struct rcu_head rcu;/* used during destruction */
 +};
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Frederic Weisbecker
2013/4/4 Frederic Weisbecker fweis...@gmail.com:
 2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
[...]

 +struct sched_domain_rq {
 +   struct sched_domain *sd;

 Why not make it part of the structure content instead of pointing to it?

I just realized that would make destroy_sched_domains() too complicated,
because only the leaf sched_domain belongs to a sched_domain_rq.
Nevermind.
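
For illustration, a minimal userspace model of that teardown constraint; the structs and helper below are simplified stand-ins assumed for the example, not the kernel code itself:

#include <stdlib.h>

/* Each sched_domain level is a separate allocation, freed one by one. */
struct sched_domain {
    struct sched_domain *parent;
};

/* Hypothetical alternative: embed the first level in the RCU object. */
struct sched_domain_rq_embedded {
    unsigned long flags;
    struct sched_domain sd;     /* leaf level owned by sd_rq */
};

/* Models destroy_sched_domains(): every level is released the same way. */
static void destroy_sched_domains_model(struct sched_domain *sd)
{
    while (sd) {
        struct sched_domain *parent = sd->parent;
        free(sd);               /* models call_rcu(&sd->rcu, ...) */
        sd = parent;
    }
}

With the embedded layout, freeing the leaf would release memory owned by the enclosing sched_domain_rq allocation, so the walk would need a special case for the first level; a pointer keeps the teardown uniform across levels.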


Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Frederic Weisbecker
2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
[...]

Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-04 Thread Vincent Guittot
On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote:
 2013/4/3 Vincent Guittot vincent.guit...@linaro.org:
[...]

[PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013-04-03 Thread Vincent Guittot
On my SMP platform, which is made of 5 cores in 2 clusters, the
nr_busy_cpus field of the sched_group_power struct is not null when the
platform is fully idle. The root cause is:
During the boot sequence, some CPUs reach the idle loop and set their
NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
field is initialized later with the assumption that all CPUs are in the busy
state, whereas some CPUs have already set their NOHZ_IDLE flag.

More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.

This condition can be ensured by adding a synchronize_rcu() call between the
destruction of the old sched_domains and the creation of the new ones, so that
the NOHZ_IDLE flag will not be updated through an old sched_domain once it has
been initialized. But this solution introduces additional latency in the
rebuild sequence that is called during cpu hotplug.
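
For reference, a sketch of that rejected ordering in the rebuild path; the exact call site and arguments are assumptions for illustration, not the actual patch:

    /* rejected fix (sketch): serialize teardown and rebuild */
    detach_destroy_domains(cpu_map);    /* free the old sched_domains */
    synchronize_rcu();                  /* wait out readers of old domains */
    build_sched_domains(cpu_map, attr); /* init NOHZ_IDLE and nr_busy_cpus */

The synchronize_rcu() call is precisely what would add the extra latency to the cpu hotplug rebuild sequence.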

As suggested by Frederic Weisbecker, another solution is to have the same
RCU lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
introduced a new sched_domain_rq struct that is the entry point for both the
sched_domains and the objects that must follow the same lifecycle, such as the
NOHZ_IDLE flag. They share the same RCU lifecycle and are always synchronized.

The synchronization is done at the cost of:
 - an additional indirection for accessing the first sched_domain level
 - an additional indirection and an rcu_dereference() before accessing the
   NOHZ_IDLE flag.
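
Concretely, the new access pattern looks something like the sketch below; the helper shape is inferred from the description above and is an assumption, not a quote from the patch:

void set_cpu_sd_state_idle_sketch(int cpu)
{
    struct sched_domain_rq *sd_rq;

    rcu_read_lock();
    /* one rcu_dereference covers both the flag and the 1st sd level */
    sd_rq = rcu_dereference(cpu_rq(cpu)->sd_rq);
    if (!sd_rq || test_bit(NOHZ_IDLE, &sd_rq->flags))
        goto unlock;
    set_bit(NOHZ_IDLE, &sd_rq->flags);
    /* ... walk sd_rq->sd and decrement each level's nr_busy_cpus ... */
unlock:
    rcu_read_unlock();
}

Because sd_rq->flags and sd_rq->sd live in the same RCU-managed object, a reader can never pair a NOHZ_IDLE state with a sched_domain from a different build.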

Change since V4:
 - link both sched_domain and NOHZ_IDLE flag in one RCU object so
   their states are always synchronized.

Change since V3:
 - NOHZ flag is not cleared if a NULL domain is attached to the CPU
 - Remove patch 2/2 which becomes useless with latest modifications

Change since V2:
 - change the initialization to idle state instead of busy state so a CPU that
   enters idle during the build of the sched_domain will not corrupt the
   initialization state

Change since V1:
 - remove the patch for SCHED softirq on an idle core use case as it was
   a side effect of the other use cases.

Signed-off-by: Vincent Guittot 
---
 include/linux/sched.h |6 +++
 kernel/sched/core.c   |  105 -
 kernel/sched/fair.c   |   35 +++--
 kernel/sched/sched.h  |   24 +--
 4 files changed, 145 insertions(+), 25 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6..2a52188 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -959,6 +959,12 @@ struct sched_domain {
unsigned long span[0];
 };
 
+struct sched_domain_rq {
+   struct sched_domain *sd;
+   unsigned long flags;
+   struct rcu_head rcu;   /* used during destruction */
+};
+
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 {
return to_cpumask(sd->span);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624..69e2313 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain 
*sd, int cpu)
destroy_sched_domain(sd, cpu);
 }
 
+static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
+{
+   if (!sd_rq)
+   return;
+
+   destroy_sched_domains(sd_rq->sd, cpu);
+   kfree_rcu(sd_rq, rcu);
+}
+
 /*
  * Keep a special pointer to the highest sched_domain that has
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
@@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
  * hold the hotplug lock.
  */
 static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
+   int cpu)
 {
struct rq *rq = cpu_rq(cpu);
-   struct sched_domain *tmp;
+   struct sched_domain_rq *tmp_rq;
+   struct sched_domain *tmp, *sd = NULL;
+
+   /*
+* If we don't have any sched_domain and associated object, we can
+* directly jump to the attach sequence otherwise we try to degenerate
+* the sched_domain
+*/
+   if (!sd_rq)
+   goto attach;
+
+   /* Get a pointer to the 1st sched_domain */
+   sd = sd_rq->sd;
 
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct 
root_domain *rd, int cpu)
destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
+   /* update sched_domain_rq */
+   sd_rq->sd = sd;
}
 
+attach:
sched_domain_debug(sd, cpu);
 
rq_attach_root(rq, rd);
-   tmp = rq->sd;
-   rcu_assign_pointer(rq->sd, sd);
-   destroy_sched_domains(tmp, cpu);
+   tmp_rq = rq->sd_rq;
+   
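
The archive truncates the hunk here; judging from the removed lines just above, the remainder presumably mirrors the old rq->sd swap, roughly (an assumption, not the archived text):

    rcu_assign_pointer(rq->sd_rq, sd_rq);   /* presumed continuation */
    destroy_sched_domain_rq(tmp_rq, cpu);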
