Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 11 April 2013 16:56, Frederic Weisbecker wrote: > On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote: >> On my smp platform which is made of 5 cores in 2 clusters, I have the >> nr_busy_cpu field of sched_group_power struct that is not null when the >> platform is fully idle. The root cause is: >> During the boot sequence, some CPUs reach the idle loop and set their >> NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus >> field is initialized later with the assumption that all CPUs are in the busy >> state whereas some CPUs have already set their NOHZ_IDLE flag. >> >> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains >> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. >> >> This condition can be ensured by adding a synchronize_rcu between the >> destruction of old sched_domains and the creation of new ones so the >> NOHZ_IDLE >> flag will not be updated with old sched_domain once it has been initialized. >> But this solution introduces a additionnal latency in the rebuild sequence >> that is called during cpu hotplug. >> >> As suggested by Frederic Weisbecker, another solution is to have the same >> rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce >> a new sched_domain_rq struct that is the entry point for both sched_domains >> and objects that must follow the same lifecycle like NOHZ_IDLE flags. They >> will share the same RCU lifecycle and will be always synchronized. >> >> The synchronization is done at the cost of : >> - an additional indirection for accessing the first sched_domain level >> - an additional indirection and a rcu_dereference before accessing to the >>NOHZ_IDLE flag. >> >> Change since v4: >> - link both sched_domain and NOHZ_IDLE flag in one RCU object so >>their states are always synchronized. 
>> >> Change since V3; >> - NOHZ flag is not cleared if a NULL domain is attached to the CPU >> - Remove patch 2/2 which becomes useless with latest modifications >> >> Change since V2: >> - change the initialization to idle state instead of busy state so a CPU >> that >>enters idle during the build of the sched_domain will not corrupt the >>initialization state >> >> Change since V1: >> - remove the patch for SCHED softirq on an idle core use case as it was >>a side effect of the other use cases. >> >> Signed-off-by: Vincent Guittot > > Ok, the lockless scheme involving nr_buzy_cpus and rq flags seem to be > correct now. > We can hope somebody will come up with a less complicated solution. But for > now that's > the best fix I've seen. Thanks > > I just have a few comments on details. > >> --- >> include/linux/sched.h |6 +++ >> kernel/sched/core.c | 105 >> - >> kernel/sched/fair.c | 35 +++-- >> kernel/sched/sched.h | 24 +-- >> 4 files changed, 145 insertions(+), 25 deletions(-) >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index d35d2b6..2a52188 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -959,6 +959,12 @@ struct sched_domain { >> unsigned long span[0]; >> }; >> >> +struct sched_domain_rq { >> + struct sched_domain *sd; >> + unsigned long flags; >> + struct rcu_head rcu;/* used during destruction */ >> +}; > > So the reason for this level of indirection won't be intuitive for those > who read that code. Please add some comments that explain why we need > that. ie: because we need the object lifecycle of sched_power and flags to be > the > same for the lockless scheme to work. 
ok, i will add a comment > >> + >> static inline struct cpumask *sched_domain_span(struct sched_domain *sd) >> { >> return to_cpumask(sd->span); >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c >> index 7f12624..69e2313 100644 >> --- a/kernel/sched/core.c >> +++ b/kernel/sched/core.c >> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain >> *sd, int cpu) >> destroy_sched_domain(sd, cpu); >> } >> >> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) >> +{ >> + if (!sd_rq) >> + return; >> + >> + destroy_sched_domains(sd_rq->sd, cpu); >> + kfree_rcu(sd_rq, rcu); >> +} >> + >> /* >> * Keep a special pointer to the highest sched_domain that has >> * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this >> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) >> * hold the hotplug lock. >> */ >> static void >> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, >> + int cpu) >> { >> struct rq *rq = cpu_rq(cpu); >> - struct sched_domain *tmp; >> + struct sched_domain_rq *tmp_rq; > > old_sd_rq would be a clearer name. ok > >> + struct
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 11 April 2013 16:56, Frederic Weisbecker fweis...@gmail.com wrote: On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote: On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org Ok, the lockless scheme involving nr_buzy_cpus and rq flags seem to be correct now. We can hope somebody will come up with a less complicated solution. But for now that's the best fix I've seen. Thanks I just have a few comments on details. --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; So the reason for this level of indirection won't be intuitive for those who read that code. Please add some comments that explain why we need that. ie: because we need the object lifecycle of sched_power and flags to be the same for the lockless scheme to work. 
ok, i will add a comment + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { return to_cpumask(sd-span); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624..69e2313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) +{ + if (!sd_rq) + return; + + destroy_sched_domains(sd_rq-sd, cpu); + kfree_rcu(sd_rq, rcu); +} + /* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) * hold the hotplug lock. */ static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, + int cpu) { struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; + struct sched_domain_rq *tmp_rq; old_sd_rq would be a clearer name. ok + struct sched_domain *tmp, *sd = NULL; + + /* + * If we don't have any sched_domain and associated object, we can + * directly jump to the attach sequence otherwise we try to degenerate +
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote: > On my smp platform which is made of 5 cores in 2 clusters, I have the > nr_busy_cpu field of sched_group_power struct that is not null when the > platform is fully idle. The root cause is: > During the boot sequence, some CPUs reach the idle loop and set their > NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus > field is initialized later with the assumption that all CPUs are in the busy > state whereas some CPUs have already set their NOHZ_IDLE flag. > > More generally, the NOHZ_IDLE flag must be initialized when new sched_domains > are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. > > This condition can be ensured by adding a synchronize_rcu between the > destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE > flag will not be updated with old sched_domain once it has been initialized. > But this solution introduces a additionnal latency in the rebuild sequence > that is called during cpu hotplug. > > As suggested by Frederic Weisbecker, another solution is to have the same > rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce > a new sched_domain_rq struct that is the entry point for both sched_domains > and objects that must follow the same lifecycle like NOHZ_IDLE flags. They > will share the same RCU lifecycle and will be always synchronized. > > The synchronization is done at the cost of : > - an additional indirection for accessing the first sched_domain level > - an additional indirection and a rcu_dereference before accessing to the >NOHZ_IDLE flag. > > Change since v4: > - link both sched_domain and NOHZ_IDLE flag in one RCU object so >their states are always synchronized. 
> > Change since V3; > - NOHZ flag is not cleared if a NULL domain is attached to the CPU > - Remove patch 2/2 which becomes useless with latest modifications > > Change since V2: > - change the initialization to idle state instead of busy state so a CPU that >enters idle during the build of the sched_domain will not corrupt the >initialization state > > Change since V1: > - remove the patch for SCHED softirq on an idle core use case as it was >a side effect of the other use cases. > > Signed-off-by: Vincent Guittot Ok, the lockless scheme involving nr_buzy_cpus and rq flags seem to be correct now. We can hope somebody will come up with a less complicated solution. But for now that's the best fix I've seen. I just have a few comments on details. > --- > include/linux/sched.h |6 +++ > kernel/sched/core.c | 105 > - > kernel/sched/fair.c | 35 +++-- > kernel/sched/sched.h | 24 +-- > 4 files changed, 145 insertions(+), 25 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index d35d2b6..2a52188 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -959,6 +959,12 @@ struct sched_domain { > unsigned long span[0]; > }; > > +struct sched_domain_rq { > + struct sched_domain *sd; > + unsigned long flags; > + struct rcu_head rcu;/* used during destruction */ > +}; So the reason for this level of indirection won't be intuitive for those who read that code. Please add some comments that explain why we need that. ie: because we need the object lifecycle of sched_power and flags to be the same for the lockless scheme to work. 
> + > static inline struct cpumask *sched_domain_span(struct sched_domain *sd) > { > return to_cpumask(sd->span); > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 7f12624..69e2313 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain > *sd, int cpu) > destroy_sched_domain(sd, cpu); > } > > +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) > +{ > + if (!sd_rq) > + return; > + > + destroy_sched_domains(sd_rq->sd, cpu); > + kfree_rcu(sd_rq, rcu); > +} > + > /* > * Keep a special pointer to the highest sched_domain that has > * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this > @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) > * hold the hotplug lock. > */ > static void > -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, > + int cpu) > { > struct rq *rq = cpu_rq(cpu); > - struct sched_domain *tmp; > + struct sched_domain_rq *tmp_rq; old_sd_rq would be a clearer name. > + struct sched_domain *tmp, *sd = NULL; > + > + /* > + * If we don't have any sched_domain and associated object, we can > + * directly jump to the attach sequence otherwise we try to degenerate > + * the sched_domain > +
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote: On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org Ok, the lockless scheme involving nr_buzy_cpus and rq flags seem to be correct now. We can hope somebody will come up with a less complicated solution. But for now that's the best fix I've seen. I just have a few comments on details. --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; So the reason for this level of indirection won't be intuitive for those who read that code. Please add some comments that explain why we need that. ie: because we need the object lifecycle of sched_power and flags to be the same for the lockless scheme to work. 
+ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { return to_cpumask(sd-span); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624..69e2313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) +{ + if (!sd_rq) + return; + + destroy_sched_domains(sd_rq-sd, cpu); + kfree_rcu(sd_rq, rcu); +} + /* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) * hold the hotplug lock. */ static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, + int cpu) { struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; + struct sched_domain_rq *tmp_rq; old_sd_rq would be a clearer name. + struct sched_domain *tmp, *sd = NULL; + + /* + * If we don't have any sched_domain and associated object, we can + * directly jump to the attach sequence otherwise we try to degenerate + * the sched_domain + */ + if (!sd_rq) + goto attach; + + /* Get a pointer to the
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 9 April 2013 14:45, Frederic Weisbecker wrote: > 2013/4/4 Vincent Guittot : >> On 4 April 2013 19:07, Frederic Weisbecker wrote: >>> Is it possible that we can be dealing here with a >>> sched_group/sched_group_power that is used on another CPU (from that >>> CPU's rq->rq_sd->sd) concurrently? >>> When we call build_sched_groups(), we might reuse an existing struct >>> sched_group used elsewhere right? If so, is there a race with the >>> above initialization? >> >> No we are not reusing an existing struct, the >> sched_group/sched_group_power that is initialized here, has just been >> created by __visit_domain_allocation_hell in build_sched_domains. The >> sched_group/sched_group_power is not already attached to any CPU > > I see. Yeah the group allocations/initialization is done per domain > found in ndoms_new[]. And there is no further reuse of these groups > once these are attached. > > Looking at the code it seems we can make some symmetric conclusion with > group release? When we destroy a per cpu domain hierarchy and then put > our references to the struct sched_group, all the other per cpu > domains that reference these sched_group are about to put their > reference soon too, right? Because IIUC we always destroy these per > cpu domains per highest level partition (those found in doms_cur[])? Yes > > I'm just asking to make sure we don't need some > atomic_dec(nr_busy_cpus) on per cpu domain release, which is not > necessary if the sched group is getting released soon. yes, it's not needed > > Thanks for your patience :) That's fine. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 9 April 2013 14:45, Frederic Weisbecker fweis...@gmail.com wrote: 2013/4/4 Vincent Guittot vincent.guit...@linaro.org: On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote: Is it possible that we can be dealing here with a sched_group/sched_group_power that is used on another CPU (from that CPU's rq->rq_sd->sd) concurrently? When we call build_sched_groups(), we might reuse an existing struct sched_group used elsewhere right? If so, is there a race with the above initialization? No we are not reusing an existing struct, the sched_group/sched_group_power that is initialized here, has just been created by __visit_domain_allocation_hell in build_sched_domains. The sched_group/sched_group_power is not already attached to any CPU I see. Yeah the group allocations/initialization is done per domain found in ndoms_new[]. And there is no further reuse of these groups once these are attached. Looking at the code it seems we can make some symmetric conclusion with group release? When we destroy a per cpu domain hierarchy and then put our references to the struct sched_group, all the other per cpu domains that reference these sched_group are about to put their reference soon too, right? Because IIUC we always destroy these per cpu domains per highest level partition (those found in doms_cur[])? Yes I'm just asking to make sure we don't need some atomic_dec(nr_busy_cpus) on per cpu domain release, which is not necessary if the sched group is getting released soon. yes, it's not needed Thanks for your patience :) That's fine. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/4 Vincent Guittot : > On 4 April 2013 19:07, Frederic Weisbecker wrote: >> Is it possible that we can be dealing here with a >> sched_group/sched_group_power that is used on another CPU (from that >> CPU's rq->rq_sd->sd) concurrently? >> When we call build_sched_groups(), we might reuse an existing struct >> sched_group used elsewhere right? If so, is there a race with the >> above initialization? > > No we are not reusing an existing struct, the > sched_group/sched_group_power that is initialized here, has just been > created by __visit_domain_allocation_hell in build_sched_domains. The > sched_group/sched_group_power is not already attached to any CPU I see. Yeah the group allocations/initialization is done per domain found in ndoms_new[]. And there is no further reuse of these groups once these are attached. Looking at the code it seems we can make some symmetric conclusion with group release? When we destroy a per cpu domain hierarchy and then put our references to the struct sched_group, all the other per cpu domains that reference these sched_group are about to put their reference soon too, right? Because IIUC we always destroy these per cpu domains per highest level partition (those found in doms_cur[])? I'm just asking to make sure we don't need some atomic_dec(nr_busy_cpus) on per cpu domain release, which is not necessary if the sched group is getting released soon. Thanks for your patience :) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 4 April 2013 19:30, Vincent Guittot wrote: > On 4 April 2013 19:07, Frederic Weisbecker wrote: >> 2013/4/3 Vincent Guittot : >>> On my smp platform which is made of 5 cores in 2 clusters, I have the >>> nr_busy_cpu field of sched_group_power struct that is not null when the >>> platform is fully idle. The root cause is: >>> During the boot sequence, some CPUs reach the idle loop and set their >>> NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus >>> field is initialized later with the assumption that all CPUs are in the busy >>> state whereas some CPUs have already set their NOHZ_IDLE flag. >>> >>> More generally, the NOHZ_IDLE flag must be initialized when new >>> sched_domains >>> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. >>> >>> This condition can be ensured by adding a synchronize_rcu between the >>> destruction of old sched_domains and the creation of new ones so the >>> NOHZ_IDLE >>> flag will not be updated with old sched_domain once it has been initialized. >>> But this solution introduces a additionnal latency in the rebuild sequence >>> that is called during cpu hotplug. >>> >>> As suggested by Frederic Weisbecker, another solution is to have the same >>> rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce >>> a new sched_domain_rq struct that is the entry point for both sched_domains >>> and objects that must follow the same lifecycle like NOHZ_IDLE flags. They >>> will share the same RCU lifecycle and will be always synchronized. >>> >>> The synchronization is done at the cost of : >>> - an additional indirection for accessing the first sched_domain level >>> - an additional indirection and a rcu_dereference before accessing to the >>>NOHZ_IDLE flag. >>> >>> Change since v4: >>> - link both sched_domain and NOHZ_IDLE flag in one RCU object so >>>their states are always synchronized. 
>>> >>> Change since V3; >>> - NOHZ flag is not cleared if a NULL domain is attached to the CPU >>> - Remove patch 2/2 which becomes useless with latest modifications >>> >>> Change since V2: >>> - change the initialization to idle state instead of busy state so a CPU >>> that >>>enters idle during the build of the sched_domain will not corrupt the >>>initialization state >>> >>> Change since V1: >>> - remove the patch for SCHED softirq on an idle core use case as it was >>>a side effect of the other use cases. >>> >>> Signed-off-by: Vincent Guittot >>> --- >>> include/linux/sched.h |6 +++ >>> kernel/sched/core.c | 105 >>> - >>> kernel/sched/fair.c | 35 +++-- >>> kernel/sched/sched.h | 24 +-- >>> 4 files changed, 145 insertions(+), 25 deletions(-) >>> >>> diff --git a/include/linux/sched.h b/include/linux/sched.h >>> index d35d2b6..2a52188 100644 >>> --- a/include/linux/sched.h >>> +++ b/include/linux/sched.h >>> @@ -959,6 +959,12 @@ struct sched_domain { >>> unsigned long span[0]; >>> }; >>> >>> +struct sched_domain_rq { >>> + struct sched_domain *sd; >>> + unsigned long flags; >>> + struct rcu_head rcu;/* used during destruction */ >>> +}; >>> + >>> static inline struct cpumask *sched_domain_span(struct sched_domain *sd) >>> { >>> return to_cpumask(sd->span); >>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c >>> index 7f12624..69e2313 100644 >>> --- a/kernel/sched/core.c >>> +++ b/kernel/sched/core.c >>> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct >>> sched_domain *sd, int cpu) >>> destroy_sched_domain(sd, cpu); >>> } >>> >>> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) >>> +{ >>> + if (!sd_rq) >>> + return; >>> + >>> + destroy_sched_domains(sd_rq->sd, cpu); >>> + kfree_rcu(sd_rq, rcu); >>> +} >>> + >>> /* >>> * Keep a special pointer to the highest sched_domain that has >>> * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this >>> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int 
cpu) >>> * hold the hotplug lock. >>> */ >>> static void >>> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >>> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, >>> + int cpu) >>> { >>> struct rq *rq = cpu_rq(cpu); >>> - struct sched_domain *tmp; >>> + struct sched_domain_rq *tmp_rq; >>> + struct sched_domain *tmp, *sd = NULL; >>> + >>> + /* >>> +* If we don't have any sched_domain and associated object, we can >>> +* directly jump to the attach sequence otherwise we try to >>> degenerate >>> +* the sched_domain >>> +*/ >>> + if (!sd_rq) >>> + goto attach; >>> + >>> + /* Get a pointer to the 1st sched_domain */ >>> + sd = sd_rq->sd; >>> >>> /* Remove the sched domains which do not
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 4 April 2013 19:30, Vincent Guittot vincent.guit...@linaro.org wrote: On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote: 2013/4/3 Vincent Guittot vincent.guit...@linaro.org: On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { return to_cpumask(sd-span); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624..69e2313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) +{ + if (!sd_rq) + return; + + destroy_sched_domains(sd_rq-sd, cpu); + kfree_rcu(sd_rq, rcu); +} + /* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) * hold the hotplug lock. 
*/ static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, + int cpu) { struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; + struct sched_domain_rq *tmp_rq; + struct sched_domain *tmp, *sd = NULL; + + /* +* If we don't have any sched_domain and associated object, we can +* directly jump to the attach sequence otherwise we try to degenerate +* the sched_domain +*/ + if (!sd_rq) + goto attach; + + /* Get a pointer to the 1st sched_domain */ + sd = sd_rq-sd; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) destroy_sched_domain(tmp, cpu); if (sd)
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/4 Vincent Guittot vincent.guit...@linaro.org: On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote: Is it possible that we can be dealing here with a sched_group/sched_group_power that is used on another CPU (from that CPU's rq->rq_sd->sd) concurrently? When we call build_sched_groups(), we might reuse an existing struct sched_group used elsewhere right? If so, is there a race with the above initialization? No we are not reusing an existing struct, the sched_group/sched_group_power that is initialized here, has just been created by __visit_domain_allocation_hell in build_sched_domains. The sched_group/sched_group_power is not already attached to any CPU I see. Yeah the group allocations/initialization is done per domain found in ndoms_new[]. And there is no further reuse of these groups once these are attached. Looking at the code it seems we can make some symmetric conclusion with group release? When we destroy a per cpu domain hierarchy and then put our references to the struct sched_group, all the other per cpu domains that reference these sched_group are about to put their reference soon too, right? Because IIUC we always destroy these per cpu domains per highest level partition (those found in doms_cur[])? I'm just asking to make sure we don't need some atomic_dec(nr_busy_cpus) on per cpu domain release, which is not necessary if the sched group is getting released soon. Thanks for your patience :) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 4 April 2013 19:07, Frederic Weisbecker wrote: > 2013/4/3 Vincent Guittot : >> On my smp platform which is made of 5 cores in 2 clusters, I have the >> nr_busy_cpu field of sched_group_power struct that is not null when the >> platform is fully idle. The root cause is: >> During the boot sequence, some CPUs reach the idle loop and set their >> NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus >> field is initialized later with the assumption that all CPUs are in the busy >> state whereas some CPUs have already set their NOHZ_IDLE flag. >> >> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains >> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. >> >> This condition can be ensured by adding a synchronize_rcu between the >> destruction of old sched_domains and the creation of new ones so the >> NOHZ_IDLE >> flag will not be updated with old sched_domain once it has been initialized. >> But this solution introduces a additionnal latency in the rebuild sequence >> that is called during cpu hotplug. >> >> As suggested by Frederic Weisbecker, another solution is to have the same >> rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce >> a new sched_domain_rq struct that is the entry point for both sched_domains >> and objects that must follow the same lifecycle like NOHZ_IDLE flags. They >> will share the same RCU lifecycle and will be always synchronized. >> >> The synchronization is done at the cost of : >> - an additional indirection for accessing the first sched_domain level >> - an additional indirection and a rcu_dereference before accessing to the >>NOHZ_IDLE flag. >> >> Change since v4: >> - link both sched_domain and NOHZ_IDLE flag in one RCU object so >>their states are always synchronized. 
>> >> Change since V3; >> - NOHZ flag is not cleared if a NULL domain is attached to the CPU >> - Remove patch 2/2 which becomes useless with latest modifications >> >> Change since V2: >> - change the initialization to idle state instead of busy state so a CPU >> that >>enters idle during the build of the sched_domain will not corrupt the >>initialization state >> >> Change since V1: >> - remove the patch for SCHED softirq on an idle core use case as it was >>a side effect of the other use cases. >> >> Signed-off-by: Vincent Guittot >> --- >> include/linux/sched.h |6 +++ >> kernel/sched/core.c | 105 >> - >> kernel/sched/fair.c | 35 +++-- >> kernel/sched/sched.h | 24 +-- >> 4 files changed, 145 insertions(+), 25 deletions(-) >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index d35d2b6..2a52188 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -959,6 +959,12 @@ struct sched_domain { >> unsigned long span[0]; >> }; >> >> +struct sched_domain_rq { >> + struct sched_domain *sd; >> + unsigned long flags; >> + struct rcu_head rcu;/* used during destruction */ >> +}; >> + >> static inline struct cpumask *sched_domain_span(struct sched_domain *sd) >> { >> return to_cpumask(sd->span); >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c >> index 7f12624..69e2313 100644 >> --- a/kernel/sched/core.c >> +++ b/kernel/sched/core.c >> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain >> *sd, int cpu) >> destroy_sched_domain(sd, cpu); >> } >> >> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) >> +{ >> + if (!sd_rq) >> + return; >> + >> + destroy_sched_domains(sd_rq->sd, cpu); >> + kfree_rcu(sd_rq, rcu); >> +} >> + >> /* >> * Keep a special pointer to the highest sched_domain that has >> * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this >> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) >> * hold the hotplug lock. 
>> */ >> static void >> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) >> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, >> + int cpu) >> { >> struct rq *rq = cpu_rq(cpu); >> - struct sched_domain *tmp; >> + struct sched_domain_rq *tmp_rq; >> + struct sched_domain *tmp, *sd = NULL; >> + >> + /* >> +* If we don't have any sched_domain and associated object, we can >> +* directly jump to the attach sequence otherwise we try to >> degenerate >> +* the sched_domain >> +*/ >> + if (!sd_rq) >> + goto attach; >> + >> + /* Get a pointer to the 1st sched_domain */ >> + sd = sd_rq->sd; >> >> /* Remove the sched domains which do not contribute to scheduling. */ >> for (tmp = sd; tmp; ) { >> @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct >> root_domain *rd, int cpu) >>
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/3 Vincent Guittot : > On my smp platform which is made of 5 cores in 2 clusters, I have the > nr_busy_cpu field of sched_group_power struct that is not null when the > platform is fully idle. The root cause is: > During the boot sequence, some CPUs reach the idle loop and set their > NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus > field is initialized later with the assumption that all CPUs are in the busy > state whereas some CPUs have already set their NOHZ_IDLE flag. > > More generally, the NOHZ_IDLE flag must be initialized when new sched_domains > are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. > > This condition can be ensured by adding a synchronize_rcu between the > destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE > flag will not be updated with old sched_domain once it has been initialized. > But this solution introduces a additionnal latency in the rebuild sequence > that is called during cpu hotplug. > > As suggested by Frederic Weisbecker, another solution is to have the same > rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce > a new sched_domain_rq struct that is the entry point for both sched_domains > and objects that must follow the same lifecycle like NOHZ_IDLE flags. They > will share the same RCU lifecycle and will be always synchronized. > > The synchronization is done at the cost of : > - an additional indirection for accessing the first sched_domain level > - an additional indirection and a rcu_dereference before accessing to the >NOHZ_IDLE flag. > > Change since v4: > - link both sched_domain and NOHZ_IDLE flag in one RCU object so >their states are always synchronized. 
> > Change since V3; > - NOHZ flag is not cleared if a NULL domain is attached to the CPU > - Remove patch 2/2 which becomes useless with latest modifications > > Change since V2: > - change the initialization to idle state instead of busy state so a CPU that >enters idle during the build of the sched_domain will not corrupt the >initialization state > > Change since V1: > - remove the patch for SCHED softirq on an idle core use case as it was >a side effect of the other use cases. > > Signed-off-by: Vincent Guittot > --- > include/linux/sched.h |6 +++ > kernel/sched/core.c | 105 > - > kernel/sched/fair.c | 35 +++-- > kernel/sched/sched.h | 24 +-- > 4 files changed, 145 insertions(+), 25 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index d35d2b6..2a52188 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -959,6 +959,12 @@ struct sched_domain { > unsigned long span[0]; > }; > > +struct sched_domain_rq { > + struct sched_domain *sd; > + unsigned long flags; > + struct rcu_head rcu;/* used during destruction */ > +}; > + > static inline struct cpumask *sched_domain_span(struct sched_domain *sd) > { > return to_cpumask(sd->span); > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 7f12624..69e2313 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain > *sd, int cpu) > destroy_sched_domain(sd, cpu); > } > > +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) > +{ > + if (!sd_rq) > + return; > + > + destroy_sched_domains(sd_rq->sd, cpu); > + kfree_rcu(sd_rq, rcu); > +} > + > /* > * Keep a special pointer to the highest sched_domain that has > * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this > @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) > * hold the hotplug lock. 
> */ > static void > -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) > +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, > + int cpu) > { > struct rq *rq = cpu_rq(cpu); > - struct sched_domain *tmp; > + struct sched_domain_rq *tmp_rq; > + struct sched_domain *tmp, *sd = NULL; > + > + /* > +* If we don't have any sched_domain and associated object, we can > +* directly jump to the attach sequence otherwise we try to degenerate > +* the sched_domain > +*/ > + if (!sd_rq) > + goto attach; > + > + /* Get a pointer to the 1st sched_domain */ > + sd = sd_rq->sd; > > /* Remove the sched domains which do not contribute to scheduling. */ > for (tmp = sd; tmp; ) { > @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct > root_domain *rd, int cpu) > destroy_sched_domain(tmp, cpu); > if (sd) > sd->child = NULL; > + /* update sched_domain_rq */ > +
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/4 Frederic Weisbecker : > 2013/4/3 Vincent Guittot : >> On my smp platform which is made of 5 cores in 2 clusters, I have the >> nr_busy_cpu field of sched_group_power struct that is not null when the >> platform is fully idle. The root cause is: >> During the boot sequence, some CPUs reach the idle loop and set their >> NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus >> field is initialized later with the assumption that all CPUs are in the busy >> state whereas some CPUs have already set their NOHZ_IDLE flag. >> >> More generally, the NOHZ_IDLE flag must be initialized when new sched_domains >> are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. >> >> This condition can be ensured by adding a synchronize_rcu between the >> destruction of old sched_domains and the creation of new ones so the >> NOHZ_IDLE >> flag will not be updated with old sched_domain once it has been initialized. >> But this solution introduces a additionnal latency in the rebuild sequence >> that is called during cpu hotplug. >> >> As suggested by Frederic Weisbecker, another solution is to have the same >> rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce >> a new sched_domain_rq struct that is the entry point for both sched_domains >> and objects that must follow the same lifecycle like NOHZ_IDLE flags. They >> will share the same RCU lifecycle and will be always synchronized. >> >> The synchronization is done at the cost of : >> - an additional indirection for accessing the first sched_domain level >> - an additional indirection and a rcu_dereference before accessing to the >>NOHZ_IDLE flag. >> >> Change since v4: >> - link both sched_domain and NOHZ_IDLE flag in one RCU object so >>their states are always synchronized. 
>> >> Change since V3; >> - NOHZ flag is not cleared if a NULL domain is attached to the CPU >> - Remove patch 2/2 which becomes useless with latest modifications >> >> Change since V2: >> - change the initialization to idle state instead of busy state so a CPU >> that >>enters idle during the build of the sched_domain will not corrupt the >>initialization state >> >> Change since V1: >> - remove the patch for SCHED softirq on an idle core use case as it was >>a side effect of the other use cases. >> >> Signed-off-by: Vincent Guittot >> --- >> include/linux/sched.h |6 +++ >> kernel/sched/core.c | 105 >> - >> kernel/sched/fair.c | 35 +++-- >> kernel/sched/sched.h | 24 +-- >> 4 files changed, 145 insertions(+), 25 deletions(-) >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index d35d2b6..2a52188 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -959,6 +959,12 @@ struct sched_domain { >> unsigned long span[0]; >> }; >> >> +struct sched_domain_rq { >> + struct sched_domain *sd; > > Why not make it part of the structure content instead of pointing to it? I just realize that would make destroy_sched_domains() too complicated because only the leaf sched_domain belong to a sched_domain_rq. Nevermind. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/3 Vincent Guittot : > On my smp platform which is made of 5 cores in 2 clusters, I have the > nr_busy_cpu field of sched_group_power struct that is not null when the > platform is fully idle. The root cause is: > During the boot sequence, some CPUs reach the idle loop and set their > NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus > field is initialized later with the assumption that all CPUs are in the busy > state whereas some CPUs have already set their NOHZ_IDLE flag. > > More generally, the NOHZ_IDLE flag must be initialized when new sched_domains > are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. > > This condition can be ensured by adding a synchronize_rcu between the > destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE > flag will not be updated with old sched_domain once it has been initialized. > But this solution introduces a additionnal latency in the rebuild sequence > that is called during cpu hotplug. > > As suggested by Frederic Weisbecker, another solution is to have the same > rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce > a new sched_domain_rq struct that is the entry point for both sched_domains > and objects that must follow the same lifecycle like NOHZ_IDLE flags. They > will share the same RCU lifecycle and will be always synchronized. > > The synchronization is done at the cost of : > - an additional indirection for accessing the first sched_domain level > - an additional indirection and a rcu_dereference before accessing to the >NOHZ_IDLE flag. > > Change since v4: > - link both sched_domain and NOHZ_IDLE flag in one RCU object so >their states are always synchronized. 
> > Change since V3; > - NOHZ flag is not cleared if a NULL domain is attached to the CPU > - Remove patch 2/2 which becomes useless with latest modifications > > Change since V2: > - change the initialization to idle state instead of busy state so a CPU that >enters idle during the build of the sched_domain will not corrupt the >initialization state > > Change since V1: > - remove the patch for SCHED softirq on an idle core use case as it was >a side effect of the other use cases. > > Signed-off-by: Vincent Guittot > --- > include/linux/sched.h |6 +++ > kernel/sched/core.c | 105 > - > kernel/sched/fair.c | 35 +++-- > kernel/sched/sched.h | 24 +-- > 4 files changed, 145 insertions(+), 25 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index d35d2b6..2a52188 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -959,6 +959,12 @@ struct sched_domain { > unsigned long span[0]; > }; > > +struct sched_domain_rq { > + struct sched_domain *sd; Why not make it part of the structure content instead of pointing to it? > + unsigned long flags; > + struct rcu_head rcu;/* used during destruction */ > +}; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/3 Vincent Guittot vincent.guit...@linaro.org: On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; Why not make it part of the structure content instead of pointing to it? + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/4 Frederic Weisbecker fweis...@gmail.com: 2013/4/3 Vincent Guittot vincent.guit...@linaro.org: On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; Why not make it part of the structure content instead of pointing to it? I just realize that would make destroy_sched_domains() too complicated because only the leaf sched_domain belong to a sched_domain_rq. Nevermind. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
2013/4/3 Vincent Guittot vincent.guit...@linaro.org: On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { return to_cpumask(sd-span); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624..69e2313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) +{ + if (!sd_rq) + return; + + destroy_sched_domains(sd_rq-sd, cpu); + kfree_rcu(sd_rq, rcu); +} + /* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) * hold the hotplug lock. 
*/ static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, + int cpu) { struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; + struct sched_domain_rq *tmp_rq; + struct sched_domain *tmp, *sd = NULL; + + /* +* If we don't have any sched_domain and associated object, we can +* directly jump to the attach sequence otherwise we try to degenerate +* the sched_domain +*/ + if (!sd_rq) + goto attach; + + /* Get a pointer to the 1st sched_domain */ + sd = sd_rq-sd; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) destroy_sched_domain(tmp, cpu); if (sd) sd-child = NULL; + /* update sched_domain_rq */ + sd_rq-sd = sd; } +attach: sched_domain_debug(sd, cpu);
Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On 4 April 2013 19:07, Frederic Weisbecker fweis...@gmail.com wrote: 2013/4/3 Vincent Guittot vincent.guit...@linaro.org: On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { return to_cpumask(sd-span); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624..69e2313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) +{ + if (!sd_rq) + return; + + destroy_sched_domains(sd_rq-sd, cpu); + kfree_rcu(sd_rq, rcu); +} + /* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) * hold the hotplug lock. 
*/ static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, + int cpu) { struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; + struct sched_domain_rq *tmp_rq; + struct sched_domain *tmp, *sd = NULL; + + /* +* If we don't have any sched_domain and associated object, we can +* directly jump to the attach sequence otherwise we try to degenerate +* the sched_domain +*/ + if (!sd_rq) + goto attach; + + /* Get a pointer to the 1st sched_domain */ + sd = sd_rq->sd; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; + /* update sched_domain_rq */ +
[PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3; - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { return to_cpumask(sd->span); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624..69e2313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) +{ + if (!sd_rq) + return; + + destroy_sched_domains(sd_rq->sd, cpu); + kfree_rcu(sd_rq, rcu); +} + /* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) * hold the hotplug lock. 
*/ static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, + int cpu) { struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; + struct sched_domain_rq *tmp_rq; + struct sched_domain *tmp, *sd = NULL; + + /* +* If we don't have any sched_domain and associated object, we can +* directly jump to the attach sequence otherwise we try to degenerate +* the sched_domain +*/ + if (!sd_rq) + goto attach; + + /* Get a pointer to the 1st sched_domain */ + sd = sd_rq->sd; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; + /* update sched_domain_rq */ + sd_rq->sd = sd; } +attach: sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); + tmp_rq = rq->sd_rq; +
[PATCH Resend v5] sched: fix init NOHZ_IDLE flag
On my smp platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. I have introduce a new sched_domain_rq struct that is the entry point for both sched_domains and objects that must follow the same lifecycle like NOHZ_IDLE flags. They will share the same RCU lifecycle and will be always synchronized. The synchronization is done at the cost of : - an additional indirection for accessing the first sched_domain level - an additional indirection and a rcu_dereference before accessing to the NOHZ_IDLE flag. Change since v4: - link both sched_domain and NOHZ_IDLE flag in one RCU object so their states are always synchronized. 
Change since V3: - NOHZ flag is not cleared if a NULL domain is attached to the CPU - Remove patch 2/2 which becomes useless with latest modifications Change since V2: - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state Change since V1: - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases. Signed-off-by: Vincent Guittot vincent.guit...@linaro.org --- include/linux/sched.h |6 +++ kernel/sched/core.c | 105 - kernel/sched/fair.c | 35 +++-- kernel/sched/sched.h | 24 +-- 4 files changed, 145 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6..2a52188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,6 +959,12 @@ struct sched_domain { unsigned long span[0]; }; +struct sched_domain_rq { + struct sched_domain *sd; + unsigned long flags; + struct rcu_head rcu;/* used during destruction */ +}; + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { return to_cpumask(sd->span); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624..69e2313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) destroy_sched_domain(sd, cpu); } +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu) +{ + if (!sd_rq) + return; + + destroy_sched_domains(sd_rq->sd, cpu); + kfree_rcu(sd_rq, rcu); +} + /* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu) * hold the hotplug lock. 
*/ static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd, + int cpu) { struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; + struct sched_domain_rq *tmp_rq; + struct sched_domain *tmp, *sd = NULL; + + /* +* If we don't have any sched_domain and associated object, we can +* directly jump to the attach sequence otherwise we try to degenerate +* the sched_domain +*/ + if (!sd_rq) + goto attach; + + /* Get a pointer to the 1st sched_domain */ + sd = sd_rq->sd; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; + /* update sched_domain_rq */ + sd_rq->sd = sd; } +attach: sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); + tmp_rq = rq->sd_rq; +