Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-21 Thread Vincent Guittot
On 20 December 2017 at 15:27, Vincent Guittot
 wrote:
> On 20 December 2017 at 15:09, Peter Zijlstra  wrote:
>> On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
>>
>>> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, 
>>> enum cpu_idle_type idle)
>>>   cpu_load_update_idle(rq);
>>>   rq_unlock_irq(rq, );
>>>
>>> - rebalance_domains(rq, CPU_IDLE);
>>> + update_blocked_averages(balance_cpu);
>>> + /*
>>> +  * This idle load balance softirq may have been
>>> +  * triggered only to update the blocked load and 
>>> shares
>>> +  * of idle CPUs (which we have just done for
>>> +  * balance_cpu). In that case skip the actual balance.
>>> +  */
>>> + if (!in_nohz_stats_kick(this_cpu))
>>> + rebalance_domains(rq, idle);
>>>   }
>>>
>>>   if (time_after(next_balance, rq->next_balance)) {
>>
>>> @@ -9336,7 +9396,12 @@ static __latent_entropy void 
>>> run_rebalance_domains(struct softirq_action *h)
>>>* and abort nohz_idle_balance altogether if we pull some load.
>>>*/
>>>   nohz_idle_balance(this_rq, idle);
>>> - rebalance_domains(this_rq, idle);
>>> + update_blocked_averages(this_rq->cpu);
>>> + if (!in_nohz_stats_kick(this_rq->cpu))
>>> + rebalance_domains(this_rq, idle);
>>> +#ifdef CONFIG_NO_HZ_COMMON
>>> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
>>> +#endif
>>>  }
>>>
>>>  /*
>>
>> You're doing the same thing to both (all) callsites of
>> rebalance_domains(), does that not suggest doing it inside and leaving
>> update_blocked_averages() where it is?
>
> The goal of moving update_blocked_averages() outside rebalance_domains
> is to not add a new parameter or use a special  cpu_idle_type value in
> rebalance_domains parameters in order to abort the rebalance sequence
> just after updating blocked load

Peter,
Is the reason above reasonable or you prefer update_blocked_averages
to stay in rebalance_domains ?

>
>>
>>


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-21 Thread Vincent Guittot
On 20 December 2017 at 15:27, Vincent Guittot
 wrote:
> On 20 December 2017 at 15:09, Peter Zijlstra  wrote:
>> On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
>>
>>> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, 
>>> enum cpu_idle_type idle)
>>>   cpu_load_update_idle(rq);
>>>   rq_unlock_irq(rq, );
>>>
>>> - rebalance_domains(rq, CPU_IDLE);
>>> + update_blocked_averages(balance_cpu);
>>> + /*
>>> +  * This idle load balance softirq may have been
>>> +  * triggered only to update the blocked load and 
>>> shares
>>> +  * of idle CPUs (which we have just done for
>>> +  * balance_cpu). In that case skip the actual balance.
>>> +  */
>>> + if (!in_nohz_stats_kick(this_cpu))
>>> + rebalance_domains(rq, idle);
>>>   }
>>>
>>>   if (time_after(next_balance, rq->next_balance)) {
>>
>>> @@ -9336,7 +9396,12 @@ static __latent_entropy void 
>>> run_rebalance_domains(struct softirq_action *h)
>>>* and abort nohz_idle_balance altogether if we pull some load.
>>>*/
>>>   nohz_idle_balance(this_rq, idle);
>>> - rebalance_domains(this_rq, idle);
>>> + update_blocked_averages(this_rq->cpu);
>>> + if (!in_nohz_stats_kick(this_rq->cpu))
>>> + rebalance_domains(this_rq, idle);
>>> +#ifdef CONFIG_NO_HZ_COMMON
>>> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
>>> +#endif
>>>  }
>>>
>>>  /*
>>
>> You're doing the same thing to both (all) callsites of
>> rebalance_domains(), does that not suggest doing it inside and leaving
>> update_blocked_averages() where it is?
>
> The goal of moving update_blocked_averages() outside rebalance_domains
> is to not add a new parameter or use a special  cpu_idle_type value in
> rebalance_domains parameters in order to abort the rebalance sequence
> just after updating blocked load

Peter,
Is the reason above reasonable or you prefer update_blocked_averages
to stay in rebalance_domains ?

>
>>
>>


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-21 Thread Vincent Guittot
Le Wednesday 20 Dec 2017 à 16:01:10 (+0100), Peter Zijlstra a écrit :
> On Wed, Dec 20, 2017 at 03:23:01PM +0100, Vincent Guittot wrote:
> > On 20 December 2017 at 15:03, Peter Zijlstra  wrote:
> > > On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
> > >> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
> > >>   if (ilb_cpu >= nr_cpu_ids)
> > >>   return;
> > >>
> > >> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> > >> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
> > >> + if (!only_update) {
> > >> + /*
> > >> +  * There's a pending/ongoing nohz kick/balance. If 
> > >> it's
> > >> +  * just for stats, convert it to a proper load 
> > >> balance.
> > >> +  */
> > >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> > >> + }
> > >>   return;
> > >> + }
> > >> +
> > >> + if (only_update)
> > >> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> > >> +
> > >>   /*
> > >>* Use smp_send_reschedule() instead of resched_cpu().
> > >>* This way we generate a sched IPI on the target cpu which
> > >
> > > This looks racy.. if its not we don't need atomic ops, if it is but is
> > > still fine it needs a comment.
> > 
> > NOHZ_STATS_KICK modification is protected by 
> > test_and_set_bit(NOHZ_BALANCE_KICK.
> > You're right that we don't need atomics ops and __set_bit() is enough
> 
> Well, you shouldn't mix atomic and non-atomic ops to the same word,
> that's asking for trouble.

In fact, the atomic operation is needed, I forgot that set/clear of 
NOHZ_TICK_STOPPED
can happen simultaneously on the ilb_cpu so we must keep
set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));

As comment we can add:
 /*
  * Update of NOHZ_STATS_KICK itself is protected by test_and_set_biti but
  * clear/set of NOHZ_TICK_STOPPED can happen simultaneously so we need
  * atomic operation
  */

> 
> But why don't you do something like:
> 
> nohz_kick()
> 
>   flags = NOHZ_STAT;
>   if (!only_update)
>   flags |= NOHZ_BALANCE;
> 
>   atomic_long_or(flags, _cpu(cpu));
> 
> 
> nohz_idle_balance()
> 
>   unsigned long do_flags = 
> atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, _flags(cpu));
> 
>   if (do_flags & NOHZ_STAT)
>   update_blocked_stuff();
> 
>   if (do_flags & NOHZ_BALANCE)
>   rebalance_domains();
> 
> That way its far more readable.
> 


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-21 Thread Vincent Guittot
Le Wednesday 20 Dec 2017 à 16:01:10 (+0100), Peter Zijlstra a écrit :
> On Wed, Dec 20, 2017 at 03:23:01PM +0100, Vincent Guittot wrote:
> > On 20 December 2017 at 15:03, Peter Zijlstra  wrote:
> > > On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
> > >> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
> > >>   if (ilb_cpu >= nr_cpu_ids)
> > >>   return;
> > >>
> > >> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> > >> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
> > >> + if (!only_update) {
> > >> + /*
> > >> +  * There's a pending/ongoing nohz kick/balance. If 
> > >> it's
> > >> +  * just for stats, convert it to a proper load 
> > >> balance.
> > >> +  */
> > >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> > >> + }
> > >>   return;
> > >> + }
> > >> +
> > >> + if (only_update)
> > >> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> > >> +
> > >>   /*
> > >>* Use smp_send_reschedule() instead of resched_cpu().
> > >>* This way we generate a sched IPI on the target cpu which
> > >
> > > This looks racy.. if its not we don't need atomic ops, if it is but is
> > > still fine it needs a comment.
> > 
> > NOHZ_STATS_KICK modification is protected by 
> > test_and_set_bit(NOHZ_BALANCE_KICK.
> > You're right that we don't need atomics ops and __set_bit() is enough
> 
> Well, you shouldn't mix atomic and non-atomic ops to the same word,
> that's asking for trouble.

In fact, the atomic operation is needed, I forgot that set/clear of 
NOHZ_TICK_STOPPED
can happen simultaneously on the ilb_cpu so we must keep
set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));

As comment we can add:
 /*
  * Update of NOHZ_STATS_KICK itself is protected by test_and_set_biti but
  * clear/set of NOHZ_TICK_STOPPED can happen simultaneously so we need
  * atomic operation
  */

> 
> But why don't you do something like:
> 
> nohz_kick()
> 
>   flags = NOHZ_STAT;
>   if (!only_update)
>   flags |= NOHZ_BALANCE;
> 
>   atomic_long_or(flags, _cpu(cpu));
> 
> 
> nohz_idle_balance()
> 
>   unsigned long do_flags = 
> atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, _flags(cpu));
> 
>   if (do_flags & NOHZ_STAT)
>   update_blocked_stuff();
> 
>   if (do_flags & NOHZ_BALANCE)
>   rebalance_domains();
> 
> That way its far more readable.
> 


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Wed, Dec 20, 2017 at 04:01:10PM +0100, Peter Zijlstra wrote:
> Well, you shouldn't mix atomic and non-atomic ops to the same word,
> that's asking for trouble.
> 
> But why don't you do something like:
> 
> nohz_kick()
> 
>   flags = NOHZ_STAT;
>   if (!only_update)
>   flags |= NOHZ_BALANCE;
> 
>   atomic_long_or(flags, _cpu(cpu));
> 
> 
> nohz_idle_balance()
> 
>   unsigned long do_flags = 
> atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, _flags(cpu));
> 
>   if (do_flags & NOHZ_STAT)
>   update_blocked_stuff();
> 
>   if (do_flags & NOHZ_BALANCE)
>   rebalance_domains();
> 
> That way its far more readable.

we could use atomic_t too, there's not that many flags in there, the
only reason its long is because of that bitmap crud.


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Wed, Dec 20, 2017 at 04:01:10PM +0100, Peter Zijlstra wrote:
> Well, you shouldn't mix atomic and non-atomic ops to the same word,
> that's asking for trouble.
> 
> But why don't you do something like:
> 
> nohz_kick()
> 
>   flags = NOHZ_STAT;
>   if (!only_update)
>   flags |= NOHZ_BALANCE;
> 
>   atomic_long_or(flags, _cpu(cpu));
> 
> 
> nohz_idle_balance()
> 
>   unsigned long do_flags = 
> atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, _flags(cpu));
> 
>   if (do_flags & NOHZ_STAT)
>   update_blocked_stuff();
> 
>   if (do_flags & NOHZ_BALANCE)
>   rebalance_domains();
> 
> That way its far more readable.

we could use atomic_t too, there's not that many flags in there, the
only reason its long is because of that bitmap crud.


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Wed, Dec 20, 2017 at 03:23:01PM +0100, Vincent Guittot wrote:
> On 20 December 2017 at 15:03, Peter Zijlstra  wrote:
> > On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
> >> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
> >>   if (ilb_cpu >= nr_cpu_ids)
> >>   return;
> >>
> >> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> >> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
> >> + if (!only_update) {
> >> + /*
> >> +  * There's a pending/ongoing nohz kick/balance. If 
> >> it's
> >> +  * just for stats, convert it to a proper load 
> >> balance.
> >> +  */
> >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> >> + }
> >>   return;
> >> + }
> >> +
> >> + if (only_update)
> >> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> >> +
> >>   /*
> >>* Use smp_send_reschedule() instead of resched_cpu().
> >>* This way we generate a sched IPI on the target cpu which
> >
> > This looks racy.. if its not we don't need atomic ops, if it is but is
> > still fine it needs a comment.
> 
> NOHZ_STATS_KICK modification is protected by 
> test_and_set_bit(NOHZ_BALANCE_KICK.
> You're right that we don't need atomics ops and __set_bit() is enough

Well, you shouldn't mix atomic and non-atomic ops to the same word,
that's asking for trouble.

But why don't you do something like:

nohz_kick()

flags = NOHZ_STAT;
if (!only_update)
flags |= NOHZ_BALANCE;

atomic_long_or(flags, _cpu(cpu));


nohz_idle_balance()

unsigned long do_flags = 
atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, _flags(cpu));

if (do_flags & NOHZ_STAT)
update_blocked_stuff();

if (do_flags & NOHZ_BALANCE)
rebalance_domains();

That way its far more readable.



Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Wed, Dec 20, 2017 at 03:23:01PM +0100, Vincent Guittot wrote:
> On 20 December 2017 at 15:03, Peter Zijlstra  wrote:
> > On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
> >> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
> >>   if (ilb_cpu >= nr_cpu_ids)
> >>   return;
> >>
> >> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> >> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
> >> + if (!only_update) {
> >> + /*
> >> +  * There's a pending/ongoing nohz kick/balance. If 
> >> it's
> >> +  * just for stats, convert it to a proper load 
> >> balance.
> >> +  */
> >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> >> + }
> >>   return;
> >> + }
> >> +
> >> + if (only_update)
> >> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> >> +
> >>   /*
> >>* Use smp_send_reschedule() instead of resched_cpu().
> >>* This way we generate a sched IPI on the target cpu which
> >
> > This looks racy.. if its not we don't need atomic ops, if it is but is
> > still fine it needs a comment.
> 
> NOHZ_STATS_KICK modification is protected by 
> test_and_set_bit(NOHZ_BALANCE_KICK.
> You're right that we don't need atomics ops and __set_bit() is enough

Well, you shouldn't mix atomic and non-atomic ops to the same word,
that's asking for trouble.

But why don't you do something like:

nohz_kick()

flags = NOHZ_STAT;
if (!only_update)
flags |= NOHZ_BALANCE;

atomic_long_or(flags, _cpu(cpu));


nohz_idle_balance()

unsigned long do_flags = 
atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, _flags(cpu));

if (do_flags & NOHZ_STAT)
update_blocked_stuff();

if (do_flags & NOHZ_BALANCE)
rebalance_domains();

That way its far more readable.



Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Vincent Guittot
On 20 December 2017 at 15:09, Peter Zijlstra  wrote:
> On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
>
>> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, 
>> enum cpu_idle_type idle)
>>   cpu_load_update_idle(rq);
>>   rq_unlock_irq(rq, );
>>
>> - rebalance_domains(rq, CPU_IDLE);
>> + update_blocked_averages(balance_cpu);
>> + /*
>> +  * This idle load balance softirq may have been
>> +  * triggered only to update the blocked load and shares
>> +  * of idle CPUs (which we have just done for
>> +  * balance_cpu). In that case skip the actual balance.
>> +  */
>> + if (!in_nohz_stats_kick(this_cpu))
>> + rebalance_domains(rq, idle);
>>   }
>>
>>   if (time_after(next_balance, rq->next_balance)) {
>
>> @@ -9336,7 +9396,12 @@ static __latent_entropy void 
>> run_rebalance_domains(struct softirq_action *h)
>>* and abort nohz_idle_balance altogether if we pull some load.
>>*/
>>   nohz_idle_balance(this_rq, idle);
>> - rebalance_domains(this_rq, idle);
>> + update_blocked_averages(this_rq->cpu);
>> + if (!in_nohz_stats_kick(this_rq->cpu))
>> + rebalance_domains(this_rq, idle);
>> +#ifdef CONFIG_NO_HZ_COMMON
>> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
>> +#endif
>>  }
>>
>>  /*
>
> You're doing the same thing to both (all) callsites of
> rebalance_domains(), does that not suggest doing it inside and leaving
> update_blocked_averages() where it is?

The goal of moving update_blocked_averages() outside rebalance_domains
is to not add a new parameter or use a special  cpu_idle_type value in
rebalance_domains parameters in order to abort the rebalance sequence
just after updating blocked load

>
>


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Vincent Guittot
On 20 December 2017 at 15:09, Peter Zijlstra  wrote:
> On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
>
>> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, 
>> enum cpu_idle_type idle)
>>   cpu_load_update_idle(rq);
>>   rq_unlock_irq(rq, );
>>
>> - rebalance_domains(rq, CPU_IDLE);
>> + update_blocked_averages(balance_cpu);
>> + /*
>> +  * This idle load balance softirq may have been
>> +  * triggered only to update the blocked load and shares
>> +  * of idle CPUs (which we have just done for
>> +  * balance_cpu). In that case skip the actual balance.
>> +  */
>> + if (!in_nohz_stats_kick(this_cpu))
>> + rebalance_domains(rq, idle);
>>   }
>>
>>   if (time_after(next_balance, rq->next_balance)) {
>
>> @@ -9336,7 +9396,12 @@ static __latent_entropy void 
>> run_rebalance_domains(struct softirq_action *h)
>>* and abort nohz_idle_balance altogether if we pull some load.
>>*/
>>   nohz_idle_balance(this_rq, idle);
>> - rebalance_domains(this_rq, idle);
>> + update_blocked_averages(this_rq->cpu);
>> + if (!in_nohz_stats_kick(this_rq->cpu))
>> + rebalance_domains(this_rq, idle);
>> +#ifdef CONFIG_NO_HZ_COMMON
>> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
>> +#endif
>>  }
>>
>>  /*
>
> You're doing the same thing to both (all) callsites of
> rebalance_domains(), does that not suggest doing it inside and leaving
> update_blocked_averages() where it is?

The goal of moving update_blocked_averages() outside rebalance_domains
is to not add a new parameter or use a special  cpu_idle_type value in
rebalance_domains parameters in order to abort the rebalance sequence
just after updating blocked load

>
>


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Vincent Guittot
On 20 December 2017 at 15:03, Peter Zijlstra  wrote:
> On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
>> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
>>   if (ilb_cpu >= nr_cpu_ids)
>>   return;
>>
>> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
>> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
>> + if (!only_update) {
>> + /*
>> +  * There's a pending/ongoing nohz kick/balance. If it's
>> +  * just for stats, convert it to a proper load balance.
>> +  */
>> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
>> + }
>>   return;
>> + }
>> +
>> + if (only_update)
>> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
>> +
>>   /*
>>* Use smp_send_reschedule() instead of resched_cpu().
>>* This way we generate a sched IPI on the target cpu which
>
> This looks racy.. if its not we don't need atomic ops, if it is but is
> still fine it needs a comment.

NOHZ_STATS_KICK modification is protected by test_and_set_bit(NOHZ_BALANCE_KICK.
You're right that we don't need atomics ops and __set_bit() is enough


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Vincent Guittot
On 20 December 2017 at 15:03, Peter Zijlstra  wrote:
> On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
>> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
>>   if (ilb_cpu >= nr_cpu_ids)
>>   return;
>>
>> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
>> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
>> + if (!only_update) {
>> + /*
>> +  * There's a pending/ongoing nohz kick/balance. If it's
>> +  * just for stats, convert it to a proper load balance.
>> +  */
>> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
>> + }
>>   return;
>> + }
>> +
>> + if (only_update)
>> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
>> +
>>   /*
>>* Use smp_send_reschedule() instead of resched_cpu().
>>* This way we generate a sched IPI on the target cpu which
>
> This looks racy.. if its not we don't need atomic ops, if it is but is
> still fine it needs a comment.

NOHZ_STATS_KICK modification is protected by test_and_set_bit(NOHZ_BALANCE_KICK.
You're right that we don't need atomics ops and __set_bit() is enough


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:

> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum 
> cpu_idle_type idle)
>   cpu_load_update_idle(rq);
>   rq_unlock_irq(rq, );
>  
> - rebalance_domains(rq, CPU_IDLE);
> + update_blocked_averages(balance_cpu);
> + /*
> +  * This idle load balance softirq may have been
> +  * triggered only to update the blocked load and shares
> +  * of idle CPUs (which we have just done for
> +  * balance_cpu). In that case skip the actual balance.
> +  */
> + if (!in_nohz_stats_kick(this_cpu))
> + rebalance_domains(rq, idle);
>   }
>  
>   if (time_after(next_balance, rq->next_balance)) {

> @@ -9336,7 +9396,12 @@ static __latent_entropy void 
> run_rebalance_domains(struct softirq_action *h)
>* and abort nohz_idle_balance altogether if we pull some load.
>*/
>   nohz_idle_balance(this_rq, idle);
> - rebalance_domains(this_rq, idle);
> + update_blocked_averages(this_rq->cpu);
> + if (!in_nohz_stats_kick(this_rq->cpu))
> + rebalance_domains(this_rq, idle);
> +#ifdef CONFIG_NO_HZ_COMMON
> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
> +#endif
>  }
>  
>  /*

You're doing the same thing to both (all) callsites of
rebalance_domains(), does that not suggest doing it inside and leaving
update_blocked_averages() where it is?




Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:

> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum 
> cpu_idle_type idle)
>   cpu_load_update_idle(rq);
>   rq_unlock_irq(rq, );
>  
> - rebalance_domains(rq, CPU_IDLE);
> + update_blocked_averages(balance_cpu);
> + /*
> +  * This idle load balance softirq may have been
> +  * triggered only to update the blocked load and shares
> +  * of idle CPUs (which we have just done for
> +  * balance_cpu). In that case skip the actual balance.
> +  */
> + if (!in_nohz_stats_kick(this_cpu))
> + rebalance_domains(rq, idle);
>   }
>  
>   if (time_after(next_balance, rq->next_balance)) {

> @@ -9336,7 +9396,12 @@ static __latent_entropy void 
> run_rebalance_domains(struct softirq_action *h)
>* and abort nohz_idle_balance altogether if we pull some load.
>*/
>   nohz_idle_balance(this_rq, idle);
> - rebalance_domains(this_rq, idle);
> + update_blocked_averages(this_rq->cpu);
> + if (!in_nohz_stats_kick(this_rq->cpu))
> + rebalance_domains(this_rq, idle);
> +#ifdef CONFIG_NO_HZ_COMMON
> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
> +#endif
>  }
>  
>  /*

You're doing the same thing to both (all) callsites of
rebalance_domains(), does that not suggest doing it inside and leaving
update_blocked_averages() where it is?




Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
>   if (ilb_cpu >= nr_cpu_ids)
>   return;
>  
> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
> + if (!only_update) {
> + /*
> +  * There's a pending/ongoing nohz kick/balance. If it's
> +  * just for stats, convert it to a proper load balance.
> +  */
> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> + }
>   return;
> + }
> +
> + if (only_update)
> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> +
>   /*
>* Use smp_send_reschedule() instead of resched_cpu().
>* This way we generate a sched IPI on the target cpu which

This looks racy.. if its not we don't need atomic ops, if it is but is
still fine it needs a comment.


Re: [PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-20 Thread Peter Zijlstra
On Fri, Dec 01, 2017 at 06:01:56PM +, Brendan Jackman wrote:
> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
>   if (ilb_cpu >= nr_cpu_ids)
>   return;
>  
> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
> + if (!only_update) {
> + /*
> +  * There's a pending/ongoing nohz kick/balance. If it's
> +  * just for stats, convert it to a proper load balance.
> +  */
> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> + }
>   return;
> + }
> +
> + if (only_update)
> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> +
>   /*
>* Use smp_send_reschedule() instead of resched_cpu().
>* This way we generate a sched IPI on the target cpu which

This looks racy.. if its not we don't need atomic ops, if it is but is
still fine it needs a comment.


[PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-01 Thread Brendan Jackman
From: Vincent Guittot 

When idle, the blocked load of CPUs will be updated only when an idle
load balance is triggered which may never happen. Because of this
uncertainty on the execution of idle load balance, the utilization,
the load and the shares of idle cfs_rq can stay artificially high and
steal shares and running time to busy cfs_rqs of the task group.
Add a new light idle load balance state which ensures that blocked loads
are periodically updated and decayed but does not perform any task
migration.

The remote load udpates are rate-limited, so that they are not
performed with a shorter period than LOAD_AVG_PERIOD (i.e. PELT
half-life). This is the period after which we have a known 50% error
in stale load.

Cc: Dietmar Eggemann 
Cc: Vincent Guittot 
Cc: Ingo Molnar 
Cc: Morten Rasmussen 
Cc: Peter Zijlstra 
Signed-off-by: Vincent Guittot 
[Switched remote update interval to use PELT half life]
[Moved update_blocked_averges call outside rebalance_domains
 to simplify code]
Signed-off-by: Brendan Jackman 
---
 kernel/sched/fair.c  | 86 ++--
 kernel/sched/sched.h |  1 +
 2 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4037e19bbca2..f83e8f0d4f06 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6258,6 +6258,9 @@ static int wake_cap(struct task_struct *p, int cpu, int 
prev_cpu)
return min_cap * 1024 < task_util(p) * capacity_margin;
 }
 
+static inline bool nohz_kick_needed(struct rq *rq, bool only_update);
+static void nohz_balancer_kick(bool only_update);
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6334,6 +6337,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, 
int sd_flag, int wake_f
}
rcu_read_unlock();
 
+#ifdef CONFIG_NO_HZ_COMMON
+   if (nohz_kick_needed(cpu_rq(new_cpu), true))
+   nohz_balancer_kick(true);
+#endif
+
return new_cpu;
 }
 
@@ -8927,6 +8935,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
+   unsigned long next_update; /* in jiffy units */
 } nohz cacheline_aligned;
 
 static inline int find_new_ilb(void)
@@ -8944,7 +8953,7 @@ static inline int find_new_ilb(void)
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(bool only_update)
 {
int ilb_cpu;
 
@@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
if (ilb_cpu >= nr_cpu_ids)
return;
 
-   if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+   if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
+   if (!only_update) {
+   /*
+* There's a pending/ongoing nohz kick/balance. If it's
+* just for stats, convert it to a proper load balance.
+*/
+   clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
+   }
return;
+   }
+
+   if (only_update)
+   set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
+
/*
 * Use smp_send_reschedule() instead of resched_cpu().
 * This way we generate a sched IPI on the target cpu which
@@ -9044,6 +9065,8 @@ void nohz_balance_enter_idle(int cpu)
atomic_inc(_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
+#else
+static inline void nohz_balancer_kick(bool only_update) {}
 #endif
 
 static DEFINE_SPINLOCK(balancing);
@@ -9075,8 +9098,6 @@ static void rebalance_domains(struct rq *rq, enum 
cpu_idle_type idle)
int need_serialize, need_decay = 0;
u64 max_cost = 0;
 
-   update_blocked_averages(cpu);
-
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
@@ -9167,6 +9188,13 @@ static void rebalance_domains(struct rq *rq, enum 
cpu_idle_type idle)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
+
+/* True iff @cpu was kicked for nohz balance, but just to update blocked load 
*/
+static inline bool in_nohz_stats_kick(int cpu)
+{
+   return test_bit(NOHZ_STATS_KICK, nohz_flags(cpu));
+}
+
 /*
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
@@ -9175,6 +9203,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
 {
int this_cpu = this_rq->cpu;
struct rq *rq;
+   struct sched_domain *sd;
int balance_cpu;
/* Earliest time when 

[PATCH v2 1/2] sched: force update of blocked load of idle cpus

2017-12-01 Thread Brendan Jackman
From: Vincent Guittot 

When idle, the blocked load of CPUs will be updated only when an idle
load balance is triggered which may never happen. Because of this
uncertainty on the execution of idle load balance, the utilization,
the load and the shares of idle cfs_rq can stay artificially high and
steal shares and running time to busy cfs_rqs of the task group.
Add a new light idle load balance state which ensures that blocked loads
are periodically updated and decayed but does not perform any task
migration.

The remote load udpates are rate-limited, so that they are not
performed with a shorter period than LOAD_AVG_PERIOD (i.e. PELT
half-life). This is the period after which we have a known 50% error
in stale load.

Cc: Dietmar Eggemann 
Cc: Vincent Guittot 
Cc: Ingo Molnar 
Cc: Morten Rasmussen 
Cc: Peter Zijlstra 
Signed-off-by: Vincent Guittot 
[Switched remote update interval to use PELT half life]
[Moved update_blocked_averges call outside rebalance_domains
 to simplify code]
Signed-off-by: Brendan Jackman 
---
 kernel/sched/fair.c  | 86 ++--
 kernel/sched/sched.h |  1 +
 2 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4037e19bbca2..f83e8f0d4f06 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6258,6 +6258,9 @@ static int wake_cap(struct task_struct *p, int cpu, int 
prev_cpu)
return min_cap * 1024 < task_util(p) * capacity_margin;
 }
 
+static inline bool nohz_kick_needed(struct rq *rq, bool only_update);
+static void nohz_balancer_kick(bool only_update);
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6334,6 +6337,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, 
int sd_flag, int wake_f
}
rcu_read_unlock();
 
+#ifdef CONFIG_NO_HZ_COMMON
+   if (nohz_kick_needed(cpu_rq(new_cpu), true))
+   nohz_balancer_kick(true);
+#endif
+
return new_cpu;
 }
 
@@ -8927,6 +8935,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
+   unsigned long next_update; /* in jiffy units */
 } nohz cacheline_aligned;
 
 static inline int find_new_ilb(void)
@@ -8944,7 +8953,7 @@ static inline int find_new_ilb(void)
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(bool only_update)
 {
int ilb_cpu;
 
@@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void)
if (ilb_cpu >= nr_cpu_ids)
return;
 
-   if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+   if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) {
+   if (!only_update) {
+   /*
+* There's a pending/ongoing nohz kick/balance. If it's
+* just for stats, convert it to a proper load balance.
+*/
+   clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
+   }
return;
+   }
+
+   if (only_update)
+   set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
+
/*
 * Use smp_send_reschedule() instead of resched_cpu().
 * This way we generate a sched IPI on the target cpu which
@@ -9044,6 +9065,8 @@ void nohz_balance_enter_idle(int cpu)
atomic_inc(_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
+#else
+static inline void nohz_balancer_kick(bool only_update) {}
 #endif
 
 static DEFINE_SPINLOCK(balancing);
@@ -9075,8 +9098,6 @@ static void rebalance_domains(struct rq *rq, enum 
cpu_idle_type idle)
int need_serialize, need_decay = 0;
u64 max_cost = 0;
 
-   update_blocked_averages(cpu);
-
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
@@ -9167,6 +9188,13 @@ static void rebalance_domains(struct rq *rq, enum 
cpu_idle_type idle)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
+
+/* True iff @cpu was kicked for nohz balance, but just to update blocked load 
*/
+static inline bool in_nohz_stats_kick(int cpu)
+{
+   return test_bit(NOHZ_STATS_KICK, nohz_flags(cpu));
+}
+
 /*
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
@@ -9175,6 +9203,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
 {
int this_cpu = this_rq->cpu;
struct rq *rq;
+   struct sched_domain *sd;
int balance_cpu;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
@@ -9184,6 +9213,23 @@ static void nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)