Re: [PATCH 3/3] sched: update blocked load when newly idle
On 14 February 2018 at 15:40, Valentin Schneider wrote:
> On 02/13/2018 10:31 AM, Vincent Guittot wrote:
>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>> updating blocked load or we can try to update them locally before entering
>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>
>> Signed-off-by: Vincent Guittot
>> ---
>>  kernel/sched/fair.c | 324 +++-
>>  1 file changed, 193 insertions(+), 131 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 9183fee..cb1ab5c 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>>
>> [...]
>>
>>  /*
>> + * idle_balance is called by schedule() if this_cpu is about to become
>> + * idle. Attempts to pull tasks from other CPUs.
>> + */
>> +static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>> +{
>> +	unsigned long next_balance = jiffies + HZ;
>> +	int this_cpu = this_rq->cpu;
>> +	struct sched_domain *sd;
>> +	int pulled_task = 0;
>> +	u64 curr_cost = 0;
>> +
>> +	/*
>> +	 * We must set idle_stamp _before_ calling idle_balance(), such that we
>> +	 * measure the duration of idle_balance() as idle time.
>> +	 */
>> +	this_rq->idle_stamp = rq_clock(this_rq);
>> +
>> +	/*
>> +	 * Do not pull tasks towards !active CPUs...
>> +	 */
>> +	if (!cpu_active(this_cpu))
>> +		return 0;
>> +
>> +	/*
>> +	 * This is OK, because current is on_cpu, which avoids it being picked
>> +	 * for load-balance and preemption/IRQs are still disabled avoiding
>> +	 * further scheduler activity on it and we're being very careful to
>> +	 * re-start the picking loop.
>> +	 */
>> +	rq_unpin_lock(this_rq, rf);
>> +
>> +	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> +	    !this_rq->rd->overload) {
>> +#ifdef CONFIG_NO_HZ_COMMON
>> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> +		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
>> +#endif
>> +		rcu_read_lock();
>> +		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>> +		if (sd)
>> +			update_next_balance(sd, &next_balance);
>> +		rcu_read_unlock();
>> +
>> +#ifdef CONFIG_NO_HZ_COMMON
>> +		/*
>> +		 * This CPU doesn't want to be disturbed by scheduler
>> +		 * houskeeping
>
> Typo here (houskeeping)
>
>> +		 */
>> +		if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
>> +			goto out;
>> +
>> +		/* Will wake up very soon. No time for fdoing anything else*/
>
> Typo here (fdoing)
>
>> +		if (this_rq->avg_idle < sysctl_sched_migration_cost)
>> +			goto out;
>> +
>> +		/* Don't need to update blocked load of idle CPUs*/
>> +		if (!has_blocked || time_after_eq(jiffies, next_blocked))
>> +			goto out;
>
> My "stats update via NEWLY_IDLE" test case started misbehaving with this
> version: we skip most NEWLY_IDLE stats updates. AFAICT this is the culprit.
>
> I believe this time check should be time_before(jiffies, next_blocked)
> (or time_before_eq depending on what you want to guarantee with the jiffy
> interval stuff).

argh.. I have completely messed up the conditions when reordering them.
Thanks for spotting this.

>
>> +
>> +		raw_spin_unlock(&this_rq->lock);
>> +		/*
>> +		 * This CPU is going to be idle and blocked load of idle CPUs
>> +		 * need to be updated. Run the ilb locally as it is a good
>> +		 * candidate for ilb instead of waking up another idle CPU.
>> +		 * Kick an normal ilb if we failed to do the update.
>> +		 */
>> +		if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>> +			kick_ilb(NOHZ_STATS_KICK);
>> +		raw_spin_lock(&this_rq->lock);
>> +#endif
>> +		goto out;
>> +	}
>> +
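For reference, a minimal sketch of the corrected bail-out, i.e. the quoted hunk with the check inverted as suggested above (an illustration only, not a new posting of the patch):

	/* Blocked load of idle CPUs has been updated recently; nothing to do yet */
	if (!has_blocked || time_before(jiffies, next_blocked))
		goto out;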
Re: [PATCH 3/3] sched: update blocked load when newly idle
On 02/13/2018 10:31 AM, Vincent Guittot wrote:
> When NEWLY_IDLE load balance is not triggered, we might need to update the
> blocked load anyway. We can kick an ilb so an idle CPU will take care of
> updating blocked load or we can try to update them locally before entering
> idle. In the latter case, we reuse part of the nohz_idle_balance.
>
> Signed-off-by: Vincent Guittot
> ---
>  kernel/sched/fair.c | 324 +++-
>  1 file changed, 193 insertions(+), 131 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9183fee..cb1ab5c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
>
> [...]
>
>  /*
> + * idle_balance is called by schedule() if this_cpu is about to become
> + * idle. Attempts to pull tasks from other CPUs.
> + */
> +static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
> +{
> +	unsigned long next_balance = jiffies + HZ;
> +	int this_cpu = this_rq->cpu;
> +	struct sched_domain *sd;
> +	int pulled_task = 0;
> +	u64 curr_cost = 0;
> +
> +	/*
> +	 * We must set idle_stamp _before_ calling idle_balance(), such that we
> +	 * measure the duration of idle_balance() as idle time.
> +	 */
> +	this_rq->idle_stamp = rq_clock(this_rq);
> +
> +	/*
> +	 * Do not pull tasks towards !active CPUs...
> +	 */
> +	if (!cpu_active(this_cpu))
> +		return 0;
> +
> +	/*
> +	 * This is OK, because current is on_cpu, which avoids it being picked
> +	 * for load-balance and preemption/IRQs are still disabled avoiding
> +	 * further scheduler activity on it and we're being very careful to
> +	 * re-start the picking loop.
> +	 */
> +	rq_unpin_lock(this_rq, rf);
> +
> +	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> +	    !this_rq->rd->overload) {
> +#ifdef CONFIG_NO_HZ_COMMON
> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
> +#endif
> +		rcu_read_lock();
> +		sd = rcu_dereference_check_sched_domain(this_rq->sd);
> +		if (sd)
> +			update_next_balance(sd, &next_balance);
> +		rcu_read_unlock();
> +
> +#ifdef CONFIG_NO_HZ_COMMON
> +		/*
> +		 * This CPU doesn't want to be disturbed by scheduler
> +		 * houskeeping

Typo here (houskeeping)

> +		 */
> +		if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
> +			goto out;
> +
> +		/* Will wake up very soon. No time for fdoing anything else*/

Typo here (fdoing)

> +		if (this_rq->avg_idle < sysctl_sched_migration_cost)
> +			goto out;
> +
> +		/* Don't need to update blocked load of idle CPUs*/
> +		if (!has_blocked || time_after_eq(jiffies, next_blocked))
> +			goto out;

My "stats update via NEWLY_IDLE" test case started misbehaving with this
version: we skip most NEWLY_IDLE stats updates. AFAICT this is the culprit.

I believe this time check should be time_before(jiffies, next_blocked)
(or time_before_eq depending on what you want to guarantee with the jiffy
interval stuff).

> +
> +		raw_spin_unlock(&this_rq->lock);
> +		/*
> +		 * This CPU is going to be idle and blocked load of idle CPUs
> +		 * need to be updated. Run the ilb locally as it is a good
> +		 * candidate for ilb instead of waking up another idle CPU.
> +		 * Kick an normal ilb if we failed to do the update.
> +		 */
> +		if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
> +			kick_ilb(NOHZ_STATS_KICK);
> +		raw_spin_lock(&this_rq->lock);
> +#endif
> +		goto out;
> +	}
> +
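To make the failure mode concrete, here is a small standalone program (not part of the patch: the jiffies values are invented, and the comparison macros are copied from include/linux/jiffies.h with the typecheck dropped) showing which branch each check takes around the next_blocked deadline:

	#include <stdio.h>

	/* Wrap-safe jiffies comparisons, as in include/linux/jiffies.h */
	#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)
	#define time_before(a, b)	((long)((a) - (b)) < 0)

	int main(void)
	{
		/* Made-up values: next blocked-load update is due at jiffy 1000 */
		unsigned long next_blocked = 1000;
		unsigned long jiffies;

		for (jiffies = 990; jiffies <= 1010; jiffies += 10) {
			/* Check as posted: bails out when the update is already due */
			int skip_posted = time_after_eq(jiffies, next_blocked);
			/* Check as suggested: bails out only while it is not due yet */
			int skip_suggested = time_before(jiffies, next_blocked);

			printf("jiffies=%lu  posted=%s  suggested=%s\n", jiffies,
			       skip_posted ? "skip" : "update",
			       skip_suggested ? "skip" : "update");
		}

		return 0;
	}

With the posted time_after_eq() check, the goto out triggers exactly once next_blocked has expired, i.e. precisely when an update is due, which is why the NEWLY_IDLE stats updates end up being skipped; time_before() only skips while the deadline has not been reached.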
Re: [PATCH 3/3] sched: update blocked load when newly idle
On 02/06/2018 04:17 PM, Vincent Guittot wrote:
> On 6 February 2018 at 15:32, Valentin Schneider wrote:
>> Hi Vincent,
>>
>> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>>> updating blocked load or we can try to update them locally before entering
>>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>>
>>> Signed-off-by: Vincent Guittot
>>> ---
>>>  kernel/sched/fair.c | 102 ++--
>>>  1 file changed, 84 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 6998528..256defe 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>>>  	*next_balance = next;
>>>  }
>>>
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>>> +static void kick_ilb(unsigned int flags);
>>> +
>>>  /*
>>>   * idle_balance is called by schedule() if this_cpu is about to become
>>>   * idle. Attempts to pull tasks from other CPUs.
>>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>>
>>>  	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>>  	    !this_rq->rd->overload) {
>>> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>>> +		unsigned long next = READ_ONCE(nohz.next_blocked);
>>
>> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to
>> make clear which is which.
>>
>>> +
>>>  		rcu_read_lock();
>>>  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>>>  		if (sd)
>>>  			update_next_balance(sd, &next_balance);
>>>  		rcu_read_unlock();
>>>
>>> +		/*
>>> +		 * Update blocked idle load if it has not been done for a
>>> +		 * while. Try to do it locally before entering idle but kick a
>>> +		 * ilb if it takes too much time and/or might delay next local
>>> +		 * wake up
>>> +		 */
>>> +		if (has_blocked && time_after_eq(jiffies, next) &&
>>> +		    (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>> +		     !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>>
>> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how
>> about storing it in an "idle_too_short" variable ?
>
> In fact it's already the 3rd time
> Why do you want it to be stored in an "idle_too_short" variable ?

I meant that locally in idle_balance() to not write the same thing twice.
TBH that's me being nitpicky (and liking explicit variables).

>
>>
>>> +			kick_ilb(NOHZ_STATS_KICK);
>>> +
>>>  		goto out;
>>>  	}
>>>
>>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>>
>>>  #ifdef CONFIG_NO_HZ_COMMON
>>>  /*
>>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + * Internal function that runs load balance for all idle cpus. The load balance
>>> + * can be a simple update of blocked load or a complete load balance with
>>> + * tasks movement depending of flags.
>>> + * For newly idle mode, we abort the loop if it takes too much time and return
>>> + * false to notify that the loop has not be completed and a ilb should be kick.
>>>   */
>>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>>>  {
>>>  	/* Earliest time when we have to do rebalance again */
>>>  	unsigned long now = jiffies;
>>>  	unsigned long next_balance = now + 60*HZ;
>>> -	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>>> +	bool has_blocked_load = false;
>>>  	int update_next_balance = 0;
>>>  	int this_cpu = this_rq->cpu;
>>> -	unsigned int flags;
>>>  	int balance_cpu;
>>> +	int ret = false;
>>>  	struct rq *rq;
>>> -
>>> -	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> -		return false;
>>> -
>>> -	if (idle != CPU_IDLE) {
>>> -		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> -		return false;
>>> -	}
>>> -
>>> -	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> +	u64 curr_cost = 0;
>>>
>>>  	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>>
>>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>  	WRITE_ONCE(nohz.has
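For what it's worth, the nitpick above amounts to something like the following in idle_balance() — a sketch only, using the idle_too_short name proposed in the review and eliding the surrounding code:

	bool idle_too_short = this_rq->avg_idle < sysctl_sched_migration_cost;

	if (idle_too_short || !this_rq->rd->overload) {
		...
		if (has_blocked && time_after_eq(jiffies, next) &&
		    (idle_too_short ||
		     !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
			kick_ilb(NOHZ_STATS_KICK);

		goto out;
	}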
Re: [PATCH 3/3] sched: update blocked load when newly idle
On 6 February 2018 at 15:32, Valentin Schneider wrote:
> Hi Vincent,
>
> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>> updating blocked load or we can try to update them locally before entering
>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>
>> Signed-off-by: Vincent Guittot
>> ---
>>  kernel/sched/fair.c | 102 ++--
>>  1 file changed, 84 insertions(+), 18 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6998528..256defe 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>>  	*next_balance = next;
>>  }
>>
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>> +static void kick_ilb(unsigned int flags);
>> +
>>  /*
>>   * idle_balance is called by schedule() if this_cpu is about to become
>>   * idle. Attempts to pull tasks from other CPUs.
>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>
>>  	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>  	    !this_rq->rd->overload) {
>> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> +		unsigned long next = READ_ONCE(nohz.next_blocked);
>
> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to
> make clear which is which.
>
>> +
>>  		rcu_read_lock();
>>  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>>  		if (sd)
>>  			update_next_balance(sd, &next_balance);
>>  		rcu_read_unlock();
>>
>> +		/*
>> +		 * Update blocked idle load if it has not been done for a
>> +		 * while. Try to do it locally before entering idle but kick a
>> +		 * ilb if it takes too much time and/or might delay next local
>> +		 * wake up
>> +		 */
>> +		if (has_blocked && time_after_eq(jiffies, next) &&
>> +		    (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> +		     !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>
> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how
> about storing it in an "idle_too_short" variable ?

In fact it's already the 3rd time
Why do you want it to be stored in an "idle_too_short" variable ?

>
>> +			kick_ilb(NOHZ_STATS_KICK);
>> +
>>  		goto out;
>>  	}
>>
>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>
>>  #ifdef CONFIG_NO_HZ_COMMON
>>  /*
>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + * Internal function that runs load balance for all idle cpus. The load balance
>> + * can be a simple update of blocked load or a complete load balance with
>> + * tasks movement depending of flags.
>> + * For newly idle mode, we abort the loop if it takes too much time and return
>> + * false to notify that the loop has not be completed and a ilb should be kick.
>>   */
>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>>  {
>>  	/* Earliest time when we have to do rebalance again */
>>  	unsigned long now = jiffies;
>>  	unsigned long next_balance = now + 60*HZ;
>> -	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> +	bool has_blocked_load = false;
>>  	int update_next_balance = 0;
>>  	int this_cpu = this_rq->cpu;
>> -	unsigned int flags;
>>  	int balance_cpu;
>> +	int ret = false;
>>  	struct rq *rq;
>> -
>> -	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> -		return false;
>> -
>> -	if (idle != CPU_IDLE) {
>> -		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> -		return false;
>> -	}
>> -
>> -	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +	u64 curr_cost = 0;
>>
>>  	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>  	WRITE_ONCE(nohz.has_blocked, 0);
>>
>>  	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> +		u64 t0, domain_cost;
>> +
>> +		t0 = sched_clock_cpu(this_cpu);
>> +
>>  		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>  			continue;
>>
>> @@ -9444,6 +9459,16 @@ static
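The quoted hunk stops short here, but given the t0/domain_cost declarations and the new comment above _nohz_idle_balance() ("we abort the loop if it takes too much time"), the per-CPU cost accounting presumably ends up looking roughly like the sketch below. This is an illustration, not the posted patch; in particular, comparing curr_cost against this_rq->avg_idle is an assumption about the abort threshold:

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		u64 t0, domain_cost;

		t0 = sched_clock_cpu(this_cpu);

		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/* ... update blocked load and/or rebalance balance_cpu ... */

		if (idle == CPU_NEWLY_IDLE) {
			domain_cost = sched_clock_cpu(this_cpu) - t0;
			curr_cost += domain_cost;
			/*
			 * Assumed threshold: stop once the updates eat into the
			 * expected idle time and let the caller kick a normal
			 * ilb for the remaining CPUs.
			 */
			if (curr_cost > this_rq->avg_idle)
				goto abort;
		}
	}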
Re: [PATCH 3/3] sched: update blocked load when newly idle
Hi Vincent,

On 02/06/2018 08:32 AM, Vincent Guittot wrote:
> When NEWLY_IDLE load balance is not triggered, we might need to update the
> blocked load anyway. We can kick an ilb so an idle CPU will take care of
> updating blocked load or we can try to update them locally before entering
> idle. In the latter case, we reuse part of the nohz_idle_balance.
>
> Signed-off-by: Vincent Guittot
> ---
>  kernel/sched/fair.c | 102 ++--
>  1 file changed, 84 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6998528..256defe 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>  	*next_balance = next;
>  }
>
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
> +static void kick_ilb(unsigned int flags);
> +
>  /*
>   * idle_balance is called by schedule() if this_cpu is about to become
>   * idle. Attempts to pull tasks from other CPUs.
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>
>  	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>  	    !this_rq->rd->overload) {
> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +		unsigned long next = READ_ONCE(nohz.next_blocked);

Ditto on 'next' - there's next_balance referenced in here so it'd be nice to
make clear which is which.

> +
>  		rcu_read_lock();
>  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>  		if (sd)
>  			update_next_balance(sd, &next_balance);
>  		rcu_read_unlock();
>
> +		/*
> +		 * Update blocked idle load if it has not been done for a
> +		 * while. Try to do it locally before entering idle but kick a
> +		 * ilb if it takes too much time and/or might delay next local
> +		 * wake up
> +		 */
> +		if (has_blocked && time_after_eq(jiffies, next) &&
> +		    (this_rq->avg_idle < sysctl_sched_migration_cost ||
> +		     !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))

"this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how
about storing it in an "idle_too_short" variable ?

> +			kick_ilb(NOHZ_STATS_KICK);
> +
>  		goto out;
>  	}
>
> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>
>  #ifdef CONFIG_NO_HZ_COMMON
>  /*
> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + * Internal function that runs load balance for all idle cpus. The load balance
> + * can be a simple update of blocked load or a complete load balance with
> + * tasks movement depending of flags.
> + * For newly idle mode, we abort the loop if it takes too much time and return
> + * false to notify that the loop has not be completed and a ilb should be kick.
>   */
> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>  {
>  	/* Earliest time when we have to do rebalance again */
>  	unsigned long now = jiffies;
>  	unsigned long next_balance = now + 60*HZ;
> -	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
> +	bool has_blocked_load = false;
>  	int update_next_balance = 0;
>  	int this_cpu = this_rq->cpu;
> -	unsigned int flags;
>  	int balance_cpu;
> +	int ret = false;
>  	struct rq *rq;
> -
> -	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> -		return false;
> -
> -	if (idle != CPU_IDLE) {
> -		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> -		return false;
> -	}
> -
> -	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +	u64 curr_cost = 0;
>
>  	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  	WRITE_ONCE(nohz.has_blocked, 0);
>
>  	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> +		u64 t0, domain_cost;
> +
> +		t0 = sched_clock_cpu(this_cpu);
> +
>  		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>  			continue;
>
> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>  			goto abort;
>  		}
>
> +		/*
> +		 * If the update is done while CPU becomes idle, we abort
> +		 * the update when its cost is