Hi Vincent,

On 02/06/2018 08:32 AM, Vincent Guittot wrote:
> When NEWLY_IDLE load balance is not triggered, we might need to update the
> blocked load anyway. We can kick an ilb so an idle CPU will take care of
> updating blocked load or we can try to update them locally before entering
> idle. In the latter case, we reuse part of the nohz_idle_balance.
> 
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
>  kernel/sched/fair.c | 102 
> ++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 84 insertions(+), 18 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6998528..256defe 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned 
> long *next_balance)
>               *next_balance = next;
>  }
>  
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum 
> cpu_idle_type idle);
> +static void kick_ilb(unsigned int flags);
> +
>  /*
>   * idle_balance is called by schedule() if this_cpu is about to become
>   * idle. Attempts to pull tasks from other CPUs.
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct 
> rq_flags *rf)
>  
>       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>           !this_rq->rd->overload) {
> +             unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +             unsigned long next = READ_ONCE(nohz.next_blocked);

Ditto on 'next' - there's next_balance referenced in here so it'd be nice to 
make clear which is which.

> +
>               rcu_read_lock();
>               sd = rcu_dereference_check_sched_domain(this_rq->sd);
>               if (sd)
>                       update_next_balance(sd, &next_balance);
>               rcu_read_unlock();
>  
> +             /*
> +              * Update blocked idle load if it has not been done for a
> +              * while. Try to do it locally before entering idle but kick a
> +              * ilb if it takes too much time and/or might delay next local
> +              * wake up
> +              */
> +             if (has_blocked && time_after_eq(jiffies, next) &&
> +                             (this_rq->avg_idle < 
> sysctl_sched_migration_cost ||
> +                             !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, 
> CPU_NEWLY_IDLE)))

"this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about 
storing it in an "idle_too_short" variable ?

> +                     kick_ilb(NOHZ_STATS_KICK);
> +
>               goto out;
>       }
>  
> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum 
> cpu_idle_type idle)
>  
>  #ifdef CONFIG_NO_HZ_COMMON
>  /*
> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + * Internal function that runs load balance for all idle cpus. The load 
> balance
> + * can be a simple update of blocked load or a complete load balance with
> + * tasks movement depending of flags.
> + * For newly idle mode, we abort the loop if it takes too much time and 
> return
> + * false to notify that the loop has not be completed and a ilb should be 
> kick.
>   */
> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum 
> cpu_idle_type idle)
>  {
>       /* Earliest time when we have to do rebalance again */
>       unsigned long now = jiffies;
>       unsigned long next_balance = now + 60*HZ;
> -     unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
> +     bool has_blocked_load = false;
>       int update_next_balance = 0;
>       int this_cpu = this_rq->cpu;
> -     unsigned int flags;
>       int balance_cpu;
> +     int ret = false;
>       struct rq *rq;
> -
> -     if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> -             return false;
> -
> -     if (idle != CPU_IDLE) {
> -             atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> -             return false;
> -     }
> -
> -     flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +     u64 curr_cost = 0;
>  
>       SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>  
> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
> cpu_idle_type idle)
>       WRITE_ONCE(nohz.has_blocked, 0);
>  
>       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> +             u64 t0, domain_cost;
> +
> +             t0 = sched_clock_cpu(this_cpu);
> +
>               if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>                       continue;
>  
> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
> cpu_idle_type idle)
>                       goto abort;
>               }
>  
> +             /*
> +              * If the update is done while CPU becomes idle, we abort
> +              * the update when its cost is higher than the average idle
> +              * time in orde to not delay a possible wake up.
> +              */
> +             if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
> +                     has_blocked_load = true;
> +                     goto abort;
> +             }
> +
>               rq = cpu_rq(balance_cpu);
>  
>               update_blocked_averages(rq->cpu);
> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, 
> enum cpu_idle_type idle)
>               if (time_after_eq(jiffies, rq->next_balance)) {
>                       struct rq_flags rf;
>  
> -                     rq_lock_irq(rq, &rf);
> +                     rq_lock_irqsave(rq, &rf);
>                       update_rq_clock(rq);
>                       cpu_load_update_idle(rq);
> -                     rq_unlock_irq(rq, &rf);
> +                     rq_unlock_irqrestore(rq, &rf);
>  
>                       if (flags & NOHZ_BALANCE_KICK)
>                               rebalance_domains(rq, CPU_IDLE);
> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, 
> enum cpu_idle_type idle)
>                       next_balance = rq->next_balance;
>                       update_next_balance = 1;
>               }
> +
> +             domain_cost = sched_clock_cpu(this_cpu) - t0;
> +             curr_cost += domain_cost;
> +
> +     }
> +
> +     /* Newly idle CPU doesn't need an update */
> +     if (idle != CPU_NEWLY_IDLE) {
> +             update_blocked_averages(this_cpu);
> +             has_blocked_load |= this_rq->has_blocked_load;
>       }
>  
> -     update_blocked_averages(this_cpu);
>       if (flags & NOHZ_BALANCE_KICK)
>               rebalance_domains(this_rq, CPU_IDLE);
>  
>       WRITE_ONCE(nohz.next_blocked,
>               now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>  
> +     /* The full idle balance loop has been done */
> +     ret = true;
> +
>  abort:
>       /* There is still blocked load, enable periodic update */
>       if (has_blocked_load)
> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
> cpu_idle_type idle)
>       if (likely(update_next_balance))
>               nohz.next_balance = next_balance;
>  
> +     return ret;
> +}
> +
> +/*
> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + */
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +{
> +     int this_cpu = this_rq->cpu;
> +     unsigned int flags;
> +
> +     if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> +             return false;
> +
> +     if (idle != CPU_IDLE) {
> +             atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +             return false;
> +     }
> +
> +     /*
> +      * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> +      */
> +     flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +     if (!(flags & NOHZ_KICK_MASK))
> +             return false;
> +
> +     _nohz_idle_balance(this_rq, flags, idle);
> +
>       return true;
>  }
>  #else
> 

Reply via email to