Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2018-04-12 Thread Libin (Huawei)


在 2018/4/11 18:26, Peter Zijlstra 写道:

On Tue, Apr 10, 2018 at 06:05:46PM -0400, Steven Rostedt wrote:


Peter,

Going through my inbox, I stumbled across this one. And it doesn't
appear to be addressed.

I think this patch is a reasonable solution.


Urgh, yeah, also seem to have forgotten about it. The proposed solution
is in fact simpler than the existing code. Also, I think deadline.c has
the exact same problem.

Zhou, could you respin and fix both?


Thanks for your reply, and I will fix the deadline.c and resend the two 
patches together.


Thanks,
Li Bin








Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2018-04-12 Thread Libin (Huawei)


在 2018/4/11 18:26, Peter Zijlstra 写道:

On Tue, Apr 10, 2018 at 06:05:46PM -0400, Steven Rostedt wrote:


Peter,

Going through my inbox, I stumbled across this one. And it doesn't
appear to be addressed.

I think this patch is a reasonable solution.


Urgh, yeah, also seem to have forgotten about it. The proposed solution
is in fact simpler than the existing code. Also, I think deadline.c has
the exact same problem.

Zhou, could you respin and fix both?


Thanks for your reply, and I will fix the deadline.c and resend the two 
patches together.


Thanks,
Li Bin








Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2018-04-11 Thread Peter Zijlstra
On Tue, Apr 10, 2018 at 06:05:46PM -0400, Steven Rostedt wrote:
> 
> Peter,
> 
> Going through my inbox, I stumbled across this one. And it doesn't
> appear to be addressed.
> 
> I think this patch is a reasonable solution.

Urgh, yeah, also seem to have forgotten about it. The proposed solution
is in fact simpler than the existing code. Also, I think deadline.c has
the exact same problem.

Zhou, could you respin and fix both?


Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2018-04-11 Thread Peter Zijlstra
On Tue, Apr 10, 2018 at 06:05:46PM -0400, Steven Rostedt wrote:
> 
> Peter,
> 
> Going through my inbox, I stumbled across this one. And it doesn't
> appear to be addressed.
> 
> I think this patch is a reasonable solution.

Urgh, yeah, also seem to have forgotten about it. The proposed solution
is in fact simpler than the existing code. Also, I think deadline.c has
the exact same problem.

Zhou, could you respin and fix both?


Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2018-04-10 Thread Steven Rostedt

Peter,

Going through my inbox, I stumbled across this one. And it doesn't
appear to be addressed.

I think this patch is a reasonable solution.

One small nit below though, but other than that.

Reviewed-by: Steven Rostedt (VMware) 


On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming  wrote:

> push_rt_task() pick the first pushable task and find an eligible
> lowest_rq, then double_lock_balance(rq, lowest_rq). So if
> double_lock_balance() unlock the rq (when double_lock_balance() return 1),
> we have to check if this task is still on the rq.
> 
> The problem is that the check conditions are not sufficient:
> 
> if (unlikely(task_rq(task) != rq ||
>!cpumask_test_cpu(lowest_rq->cpu, >cpus_allowed) ||
>task_running(rq, task) ||
>!rt_task(task) ||
>!task_on_rq_queued(task))) {
> 
> cpu2  cpu1cpu0
> push_rt_task(rq1)
>   pick task_A on rq1
>   find rq0
> double_lock_balance(rq1, rq0)
>   unlock(rq1)
>   rq1 __schedule
> pick task_A run
>   task_A sleep (dequeued)
>   lock(rq0)
>   lock(rq1)
> do_above_check(task_A)
>   task_rq(task_A) == rq1
>   cpus_allowed unchanged
>   task_running == false
>   rt_task(task_A) == true
>   try_to_wake_up(task_A)
> select_cpu = cpu3
> enqueue(rq3, task_A)
> task_A->on_rq = 1
>   task_on_rq_queued(task_A)
> above_check passed, return rq0
> ...
> migrate task_A from rq1 to rq0
> 
> So we can't rely on these checks of task_A to make sure the task_A is
> still on the rq1, even though we hold the rq1->lock. This patch will
> repick the first pushable task to be sure the task is still on the rq.
> 
> Signed-off-by: Zhou Chengming 
> ---
>  kernel/sched/rt.c | 49 +++--
>  1 file changed, 23 insertions(+), 26 deletions(-)
> 
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 45caf93..787b721 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1703,6 +1703,26 @@ static int find_lowest_rq(struct task_struct *task)
>   return -1;
>  }
>  
> +static struct task_struct *pick_next_pushable_task(struct rq *rq)
> +{
> + struct task_struct *p;
> +
> + if (!has_pushable_tasks(rq))
> + return NULL;
> +
> + p = plist_first_entry(>rt.pushable_tasks,
> +   struct task_struct, pushable_tasks);
> +
> + BUG_ON(rq->cpu != task_cpu(p));
> + BUG_ON(task_current(rq, p));
> + BUG_ON(p->nr_cpus_allowed <= 1);
> +
> + BUG_ON(!task_on_rq_queued(p));
> + BUG_ON(!rt_task(p));
> +
> + return p;
> +}
> +
>  /* Will lock the rq it finds */
>  static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq 
> *rq)
>  {
> @@ -1734,13 +1754,10 @@ static struct rq *find_lock_lowest_rq(struct 
> task_struct *task, struct rq *rq)
>* We had to unlock the run queue. In
>* the mean time, task could have
>* migrated already or had its affinity changed.
> -  * Also make sure that it wasn't scheduled on its rq.
>*/
> - if (unlikely(task_rq(task) != rq ||
> -  !cpumask_test_cpu(lowest_rq->cpu, 
> >cpus_allowed) ||
> -  task_running(rq, task) ||
> -  !rt_task(task) ||
> -  !task_on_rq_queued(task))) {
> + struct task_struct *next_task = 
> pick_next_pushable_task(rq);

I would put the above declaration before the above comment.

-- Steve

> + if (unlikely(next_task != task ||
> +  !cpumask_test_cpu(lowest_rq->cpu, 
> >cpus_allowed))) {
>  
>   double_unlock_balance(rq, lowest_rq);
>   lowest_rq = NULL;
> @@ -1760,26 +1777,6 @@ static struct rq *find_lock_lowest_rq(struct 
> task_struct *task, struct rq *rq)
>   return lowest_rq;
>  }
>  
> -static struct task_struct *pick_next_pushable_task(struct rq *rq)
> -{
> - struct task_struct *p;
> -
> - if (!has_pushable_tasks(rq))
> - return NULL;
> -
> - p = plist_first_entry(>rt.pushable_tasks,
> -   struct task_struct, pushable_tasks);
> -
> - BUG_ON(rq->cpu != task_cpu(p));
> - BUG_ON(task_current(rq, p));
> - BUG_ON(p->nr_cpus_allowed <= 1);
> -
> - BUG_ON(!task_on_rq_queued(p));
> - BUG_ON(!rt_task(p));
> -
> - return p;
> -}
> -
>  /*
>   * If the 

Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2018-04-10 Thread Steven Rostedt

Peter,

Going through my inbox, I stumbled across this one. And it doesn't
appear to be addressed.

I think this patch is a reasonable solution.

One small nit below though, but other than that.

Reviewed-by: Steven Rostedt (VMware) 


On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming  wrote:

> push_rt_task() pick the first pushable task and find an eligible
> lowest_rq, then double_lock_balance(rq, lowest_rq). So if
> double_lock_balance() unlock the rq (when double_lock_balance() return 1),
> we have to check if this task is still on the rq.
> 
> The problem is that the check conditions are not sufficient:
> 
> if (unlikely(task_rq(task) != rq ||
>!cpumask_test_cpu(lowest_rq->cpu, >cpus_allowed) ||
>task_running(rq, task) ||
>!rt_task(task) ||
>!task_on_rq_queued(task))) {
> 
> cpu2  cpu1cpu0
> push_rt_task(rq1)
>   pick task_A on rq1
>   find rq0
> double_lock_balance(rq1, rq0)
>   unlock(rq1)
>   rq1 __schedule
> pick task_A run
>   task_A sleep (dequeued)
>   lock(rq0)
>   lock(rq1)
> do_above_check(task_A)
>   task_rq(task_A) == rq1
>   cpus_allowed unchanged
>   task_running == false
>   rt_task(task_A) == true
>   try_to_wake_up(task_A)
> select_cpu = cpu3
> enqueue(rq3, task_A)
> task_A->on_rq = 1
>   task_on_rq_queued(task_A)
> above_check passed, return rq0
> ...
> migrate task_A from rq1 to rq0
> 
> So we can't rely on these checks of task_A to make sure the task_A is
> still on the rq1, even though we hold the rq1->lock. This patch will
> repick the first pushable task to be sure the task is still on the rq.
> 
> Signed-off-by: Zhou Chengming 
> ---
>  kernel/sched/rt.c | 49 +++--
>  1 file changed, 23 insertions(+), 26 deletions(-)
> 
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 45caf93..787b721 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1703,6 +1703,26 @@ static int find_lowest_rq(struct task_struct *task)
>   return -1;
>  }
>  
> +static struct task_struct *pick_next_pushable_task(struct rq *rq)
> +{
> + struct task_struct *p;
> +
> + if (!has_pushable_tasks(rq))
> + return NULL;
> +
> + p = plist_first_entry(>rt.pushable_tasks,
> +   struct task_struct, pushable_tasks);
> +
> + BUG_ON(rq->cpu != task_cpu(p));
> + BUG_ON(task_current(rq, p));
> + BUG_ON(p->nr_cpus_allowed <= 1);
> +
> + BUG_ON(!task_on_rq_queued(p));
> + BUG_ON(!rt_task(p));
> +
> + return p;
> +}
> +
>  /* Will lock the rq it finds */
>  static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq 
> *rq)
>  {
> @@ -1734,13 +1754,10 @@ static struct rq *find_lock_lowest_rq(struct 
> task_struct *task, struct rq *rq)
>* We had to unlock the run queue. In
>* the mean time, task could have
>* migrated already or had its affinity changed.
> -  * Also make sure that it wasn't scheduled on its rq.
>*/
> - if (unlikely(task_rq(task) != rq ||
> -  !cpumask_test_cpu(lowest_rq->cpu, 
> >cpus_allowed) ||
> -  task_running(rq, task) ||
> -  !rt_task(task) ||
> -  !task_on_rq_queued(task))) {
> + struct task_struct *next_task = 
> pick_next_pushable_task(rq);

I would put the above declaration before the above comment.

-- Steve

> + if (unlikely(next_task != task ||
> +  !cpumask_test_cpu(lowest_rq->cpu, 
> >cpus_allowed))) {
>  
>   double_unlock_balance(rq, lowest_rq);
>   lowest_rq = NULL;
> @@ -1760,26 +1777,6 @@ static struct rq *find_lock_lowest_rq(struct 
> task_struct *task, struct rq *rq)
>   return lowest_rq;
>  }
>  
> -static struct task_struct *pick_next_pushable_task(struct rq *rq)
> -{
> - struct task_struct *p;
> -
> - if (!has_pushable_tasks(rq))
> - return NULL;
> -
> - p = plist_first_entry(>rt.pushable_tasks,
> -   struct task_struct, pushable_tasks);
> -
> - BUG_ON(rq->cpu != task_cpu(p));
> - BUG_ON(task_current(rq, p));
> - BUG_ON(p->nr_cpus_allowed <= 1);
> -
> - BUG_ON(!task_on_rq_queued(p));
> - BUG_ON(!rt_task(p));
> -
> - return p;
> -}
> -
>  /*
>   * If the current CPU has more than one RT task, see if the non
>   * running task 

Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-10-06 Thread zhouchengming

Hi Steven, Peter,

On 2017/9/26 11:18, Steven Rostedt wrote:

On Tue, 26 Sep 2017 09:23:20 +0800
zhouchengming  wrote:


On 2017/9/26 3:40, Steven Rostedt wrote:

On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming   wrote:


push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
pick task_A on rq1
find rq0
  double_lock_balance(rq1, rq0)
unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
lock(rq0)
lock(rq1)
  do_above_check(task_A)
task_rq(task_A) == rq1
cpus_allowed unchanged
task_running == false
rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)

How can this happen? The try_to_wake_up(task_A) needs to grab the rq
that task A is on, and we have that rq lock.

/me confused.

-- Steve

Thanks for the reply!
After the task_A sleep on cpu1, the try_to_wake_up(task_A) on cpu0 select a 
different cpu3,
so it will grab the rq3 lock, not the rq1 lock.

Ah crap. This is caused by 7608dec2ce20 ("sched: Drop the rq argument
to sched_class::select_task_rq()"). Because this code depends on
try_to_wake_up() grabbing the task's rq lock. But it no longer does
that, and it causes this race.

OK, I need to look at this deeper when I'm not so jetlagged and typing
this because I can't sleep at 5am.

Thanks for pointing this out!

It may be fixed by simply grabbing the run queue lock on migration, as
that would sync things up.


Is there any new solution? I don't think grabbing the rq lock without the 
task->pi_lock
will fix this problem. And I think my patch is correct and the changes are 
small.

Thanks!


Peter?


-- Steve



.






Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-10-06 Thread zhouchengming

Hi Steven, Peter,

On 2017/9/26 11:18, Steven Rostedt wrote:

On Tue, 26 Sep 2017 09:23:20 +0800
zhouchengming  wrote:


On 2017/9/26 3:40, Steven Rostedt wrote:

On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming   wrote:


push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
pick task_A on rq1
find rq0
  double_lock_balance(rq1, rq0)
unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
lock(rq0)
lock(rq1)
  do_above_check(task_A)
task_rq(task_A) == rq1
cpus_allowed unchanged
task_running == false
rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)

How can this happen? The try_to_wake_up(task_A) needs to grab the rq
that task A is on, and we have that rq lock.

/me confused.

-- Steve

Thanks for the reply!
After the task_A sleep on cpu1, the try_to_wake_up(task_A) on cpu0 select a 
different cpu3,
so it will grab the rq3 lock, not the rq1 lock.

Ah crap. This is caused by 7608dec2ce20 ("sched: Drop the rq argument
to sched_class::select_task_rq()"). Because this code depends on
try_to_wake_up() grabbing the task's rq lock. But it no longer does
that, and it causes this race.

OK, I need to look at this deeper when I'm not so jetlagged and typing
this because I can't sleep at 5am.

Thanks for pointing this out!

It may be fixed by simply grabbing the run queue lock on migration, as
that would sync things up.


Is there any new solution? I don't think grabbing the rq lock without the 
task->pi_lock
will fix this problem. And I think my patch is correct and the changes are 
small.

Thanks!


Peter?


-- Steve



.






Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread Steven Rostedt
On Tue, 26 Sep 2017 09:23:20 +0800
zhouchengming  wrote:

> On 2017/9/26 3:40, Steven Rostedt wrote:
> > On Mon, 11 Sep 2017 14:51:49 +0800
> > Zhou Chengming  wrote:
> >  
> >> push_rt_task() pick the first pushable task and find an eligible
> >> lowest_rq, then double_lock_balance(rq, lowest_rq). So if
> >> double_lock_balance() unlock the rq (when double_lock_balance() return 1),
> >> we have to check if this task is still on the rq.
> >>
> >> The problem is that the check conditions are not sufficient:
> >>
> >> if (unlikely(task_rq(task) != rq ||
> >> !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
> >> task_running(rq, task) ||
> >> !rt_task(task) ||
> >> !task_on_rq_queued(task))) {
> >>
> >> cpu2   cpu1cpu0
> >> push_rt_task(rq1)
> >>pick task_A on rq1
> >>find rq0
> >>  double_lock_balance(rq1, rq0)
> >>unlock(rq1)
> >>rq1 __schedule
> >>  pick task_A run
> >>task_A sleep (dequeued)
> >>lock(rq0)
> >>lock(rq1)
> >>  do_above_check(task_A)
> >>task_rq(task_A) == rq1
> >>cpus_allowed unchanged
> >>task_running == false
> >>rt_task(task_A) == true
> >>try_to_wake_up(task_A)
> >>  select_cpu = cpu3
> >>  enqueue(rq3, task_A)  
> > How can this happen? The try_to_wake_up(task_A) needs to grab the rq
> > that task A is on, and we have that rq lock.
> >
> > /me confused.
> >
> > -- Steve  
> 
> Thanks for the reply!
> After the task_A sleep on cpu1, the try_to_wake_up(task_A) on cpu0 select a 
> different cpu3,
> so it will grab the rq3 lock, not the rq1 lock.

Ah crap. This is caused by 7608dec2ce20 ("sched: Drop the rq argument
to sched_class::select_task_rq()"). Because this code depends on
try_to_wake_up() grabbing the task's rq lock. But it no longer does
that, and it causes this race.

OK, I need to look at this deeper when I'm not so jetlagged and typing
this because I can't sleep at 5am.

Thanks for pointing this out!

It may be fixed by simply grabbing the run queue lock on migration, as
that would sync things up.

Peter?


-- Steve




Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread Steven Rostedt
On Tue, 26 Sep 2017 09:23:20 +0800
zhouchengming  wrote:

> On 2017/9/26 3:40, Steven Rostedt wrote:
> > On Mon, 11 Sep 2017 14:51:49 +0800
> > Zhou Chengming  wrote:
> >  
> >> push_rt_task() pick the first pushable task and find an eligible
> >> lowest_rq, then double_lock_balance(rq, lowest_rq). So if
> >> double_lock_balance() unlock the rq (when double_lock_balance() return 1),
> >> we have to check if this task is still on the rq.
> >>
> >> The problem is that the check conditions are not sufficient:
> >>
> >> if (unlikely(task_rq(task) != rq ||
> >> !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
> >> task_running(rq, task) ||
> >> !rt_task(task) ||
> >> !task_on_rq_queued(task))) {
> >>
> >> cpu2   cpu1cpu0
> >> push_rt_task(rq1)
> >>pick task_A on rq1
> >>find rq0
> >>  double_lock_balance(rq1, rq0)
> >>unlock(rq1)
> >>rq1 __schedule
> >>  pick task_A run
> >>task_A sleep (dequeued)
> >>lock(rq0)
> >>lock(rq1)
> >>  do_above_check(task_A)
> >>task_rq(task_A) == rq1
> >>cpus_allowed unchanged
> >>task_running == false
> >>rt_task(task_A) == true
> >>try_to_wake_up(task_A)
> >>  select_cpu = cpu3
> >>  enqueue(rq3, task_A)  
> > How can this happen? The try_to_wake_up(task_A) needs to grab the rq
> > that task A is on, and we have that rq lock.
> >
> > /me confused.
> >
> > -- Steve  
> 
> Thanks for the reply!
> After the task_A sleep on cpu1, the try_to_wake_up(task_A) on cpu0 select a 
> different cpu3,
> so it will grab the rq3 lock, not the rq1 lock.

Ah crap. This is caused by 7608dec2ce20 ("sched: Drop the rq argument
to sched_class::select_task_rq()"). Because this code depends on
try_to_wake_up() grabbing the task's rq lock. But it no longer does
that, and it causes this race.

OK, I need to look at this deeper when I'm not so jetlagged and typing
this because I can't sleep at 5am.

Thanks for pointing this out!

It may be fixed by simply grabbing the run queue lock on migration, as
that would sync things up.

Peter?


-- Steve




Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread zhouchengming

On 2017/9/26 3:40, Steven Rostedt wrote:

On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming  wrote:


push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
   pick task_A on rq1
   find rq0
 double_lock_balance(rq1, rq0)
   unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
   lock(rq0)
   lock(rq1)
 do_above_check(task_A)
   task_rq(task_A) == rq1
   cpus_allowed unchanged
   task_running == false
   rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)

How can this happen? The try_to_wake_up(task_A) needs to grab the rq
that task A is on, and we have that rq lock.

/me confused.

-- Steve


Thanks for the reply!
After the task_A sleep on cpu1, the try_to_wake_up(task_A) on cpu0 select a 
different cpu3,
so it will grab the rq3 lock, not the rq1 lock.

Thanks.




task_A->on_rq = 1
   task_on_rq_queued(task_A)
 above_check passed, return rq0
 ...
 migrate task_A from rq1 to rq0

So we can't rely on these checks of task_A to make sure the task_A is
still on the rq1, even though we hold the rq1->lock. This patch will
repick the first pushable task to be sure the task is still on the rq.

Signed-off-by: Zhou Chengming


.






Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread zhouchengming

On 2017/9/26 3:40, Steven Rostedt wrote:

On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming  wrote:


push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
   pick task_A on rq1
   find rq0
 double_lock_balance(rq1, rq0)
   unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
   lock(rq0)
   lock(rq1)
 do_above_check(task_A)
   task_rq(task_A) == rq1
   cpus_allowed unchanged
   task_running == false
   rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)

How can this happen? The try_to_wake_up(task_A) needs to grab the rq
that task A is on, and we have that rq lock.

/me confused.

-- Steve


Thanks for the reply!
After the task_A sleep on cpu1, the try_to_wake_up(task_A) on cpu0 select a 
different cpu3,
so it will grab the rq3 lock, not the rq1 lock.

Thanks.




task_A->on_rq = 1
   task_on_rq_queued(task_A)
 above_check passed, return rq0
 ...
 migrate task_A from rq1 to rq0

So we can't rely on these checks of task_A to make sure the task_A is
still on the rq1, even though we hold the rq1->lock. This patch will
repick the first pushable task to be sure the task is still on the rq.

Signed-off-by: Zhou Chengming


.






Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread Steven Rostedt
On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming  wrote:

> push_rt_task() pick the first pushable task and find an eligible
> lowest_rq, then double_lock_balance(rq, lowest_rq). So if
> double_lock_balance() unlock the rq (when double_lock_balance() return 1),
> we have to check if this task is still on the rq.
> 
> The problem is that the check conditions are not sufficient:
> 
> if (unlikely(task_rq(task) != rq ||
>!cpumask_test_cpu(lowest_rq->cpu, >cpus_allowed) ||
>task_running(rq, task) ||
>!rt_task(task) ||
>!task_on_rq_queued(task))) {
> 
> cpu2  cpu1cpu0
> push_rt_task(rq1)
>   pick task_A on rq1
>   find rq0
> double_lock_balance(rq1, rq0)
>   unlock(rq1)
>   rq1 __schedule
> pick task_A run
>   task_A sleep (dequeued)
>   lock(rq0)
>   lock(rq1)
> do_above_check(task_A)
>   task_rq(task_A) == rq1
>   cpus_allowed unchanged
>   task_running == false
>   rt_task(task_A) == true
>   try_to_wake_up(task_A)
> select_cpu = cpu3
> enqueue(rq3, task_A)

How can this happen? The try_to_wake_up(task_A) needs to grab the rq
that task A is on, and we have that rq lock. 

/me confused.

-- Steve


> task_A->on_rq = 1
>   task_on_rq_queued(task_A)
> above_check passed, return rq0
> ...
> migrate task_A from rq1 to rq0
> 
> So we can't rely on these checks of task_A to make sure the task_A is
> still on the rq1, even though we hold the rq1->lock. This patch will
> repick the first pushable task to be sure the task is still on the rq.
> 
> Signed-off-by: Zhou Chengming 
>


Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread Steven Rostedt
On Mon, 11 Sep 2017 14:51:49 +0800
Zhou Chengming  wrote:

> push_rt_task() pick the first pushable task and find an eligible
> lowest_rq, then double_lock_balance(rq, lowest_rq). So if
> double_lock_balance() unlock the rq (when double_lock_balance() return 1),
> we have to check if this task is still on the rq.
> 
> The problem is that the check conditions are not sufficient:
> 
> if (unlikely(task_rq(task) != rq ||
>!cpumask_test_cpu(lowest_rq->cpu, >cpus_allowed) ||
>task_running(rq, task) ||
>!rt_task(task) ||
>!task_on_rq_queued(task))) {
> 
> cpu2  cpu1cpu0
> push_rt_task(rq1)
>   pick task_A on rq1
>   find rq0
> double_lock_balance(rq1, rq0)
>   unlock(rq1)
>   rq1 __schedule
> pick task_A run
>   task_A sleep (dequeued)
>   lock(rq0)
>   lock(rq1)
> do_above_check(task_A)
>   task_rq(task_A) == rq1
>   cpus_allowed unchanged
>   task_running == false
>   rt_task(task_A) == true
>   try_to_wake_up(task_A)
> select_cpu = cpu3
> enqueue(rq3, task_A)

How can this happen? The try_to_wake_up(task_A) needs to grab the rq
that task A is on, and we have that rq lock. 

/me confused.

-- Steve


> task_A->on_rq = 1
>   task_on_rq_queued(task_A)
> above_check passed, return rq0
> ...
> migrate task_A from rq1 to rq0
> 
> So we can't rely on these checks of task_A to make sure the task_A is
> still on the rq1, even though we hold the rq1->lock. This patch will
> repick the first pushable task to be sure the task is still on the rq.
> 
> Signed-off-by: Zhou Chengming 
>


Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread zhouchengming

ping...
Or it isn't a real problem ?

Thanks.

On 2017/9/11 14:51, Zhou Chengming wrote:

push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
   pick task_A on rq1
   find rq0
 double_lock_balance(rq1, rq0)
   unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
   lock(rq0)
   lock(rq1)
 do_above_check(task_A)
   task_rq(task_A) == rq1
   cpus_allowed unchanged
   task_running == false
   rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)
  task_A->on_rq = 1
   task_on_rq_queued(task_A)
 above_check passed, return rq0
 ...
 migrate task_A from rq1 to rq0

So we can't rely on these checks of task_A to make sure the task_A is
still on the rq1, even though we hold the rq1->lock. This patch will
repick the first pushable task to be sure the task is still on the rq.

Signed-off-by: Zhou Chengming
---
  kernel/sched/rt.c | 49 +++--
  1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf93..787b721 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1703,6 +1703,26 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
  }

+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+   struct task_struct *p;
+
+   if (!has_pushable_tasks(rq))
+   return NULL;
+
+   p = plist_first_entry(>rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+   BUG_ON(rq->cpu != task_cpu(p));
+   BUG_ON(task_current(rq, p));
+   BUG_ON(p->nr_cpus_allowed<= 1);
+
+   BUG_ON(!task_on_rq_queued(p));
+   BUG_ON(!rt_task(p));
+
+   return p;
+}
+
  /* Will lock the rq it finds */
  static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
  {
@@ -1734,13 +1754,10 @@ static struct rq *find_lock_lowest_rq(struct 
task_struct *task, struct rq *rq)
 * We had to unlock the run queue. In
 * the mean time, task could have
 * migrated already or had its affinity changed.
-* Also make sure that it wasn't scheduled on its rq.
 */
-   if (unlikely(task_rq(task) != rq ||
-
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
-task_running(rq, task) ||
-!rt_task(task) ||
-!task_on_rq_queued(task))) {
+   struct task_struct *next_task = 
pick_next_pushable_task(rq);
+   if (unlikely(next_task != task ||
+
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed))) {

double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1760,26 +1777,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct 
*task, struct rq *rq)
return lowest_rq;
  }

-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
-   struct task_struct *p;
-
-   if (!has_pushable_tasks(rq))
-   return NULL;
-
-   p = plist_first_entry(>rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
-   BUG_ON(rq->cpu != task_cpu(p));
-   BUG_ON(task_current(rq, p));
-   BUG_ON(p->nr_cpus_allowed<= 1);
-
-   BUG_ON(!task_on_rq_queued(p));
-   BUG_ON(!rt_task(p));
-
-   return p;
-}
-
  /*
   * If the current CPU has more than one RT task, see if the non
   * running task can migrate over to a CPU that is running a task





Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-25 Thread zhouchengming

ping...
Or it isn't a real problem ?

Thanks.

On 2017/9/11 14:51, Zhou Chengming wrote:

push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
   pick task_A on rq1
   find rq0
 double_lock_balance(rq1, rq0)
   unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
   lock(rq0)
   lock(rq1)
 do_above_check(task_A)
   task_rq(task_A) == rq1
   cpus_allowed unchanged
   task_running == false
   rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)
  task_A->on_rq = 1
   task_on_rq_queued(task_A)
 above_check passed, return rq0
 ...
 migrate task_A from rq1 to rq0

So we can't rely on these checks of task_A to make sure the task_A is
still on the rq1, even though we hold the rq1->lock. This patch will
repick the first pushable task to be sure the task is still on the rq.

Signed-off-by: Zhou Chengming
---
  kernel/sched/rt.c | 49 +++--
  1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf93..787b721 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1703,6 +1703,26 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
  }

+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+   struct task_struct *p;
+
+   if (!has_pushable_tasks(rq))
+   return NULL;
+
+   p = plist_first_entry(>rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+   BUG_ON(rq->cpu != task_cpu(p));
+   BUG_ON(task_current(rq, p));
+   BUG_ON(p->nr_cpus_allowed<= 1);
+
+   BUG_ON(!task_on_rq_queued(p));
+   BUG_ON(!rt_task(p));
+
+   return p;
+}
+
  /* Will lock the rq it finds */
  static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
  {
@@ -1734,13 +1754,10 @@ static struct rq *find_lock_lowest_rq(struct 
task_struct *task, struct rq *rq)
 * We had to unlock the run queue. In
 * the mean time, task could have
 * migrated already or had its affinity changed.
-* Also make sure that it wasn't scheduled on its rq.
 */
-   if (unlikely(task_rq(task) != rq ||
-
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
-task_running(rq, task) ||
-!rt_task(task) ||
-!task_on_rq_queued(task))) {
+   struct task_struct *next_task = 
pick_next_pushable_task(rq);
+   if (unlikely(next_task != task ||
+
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed))) {

double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1760,26 +1777,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct 
*task, struct rq *rq)
return lowest_rq;
  }

-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
-   struct task_struct *p;
-
-   if (!has_pushable_tasks(rq))
-   return NULL;
-
-   p = plist_first_entry(>rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
-   BUG_ON(rq->cpu != task_cpu(p));
-   BUG_ON(task_current(rq, p));
-   BUG_ON(p->nr_cpus_allowed<= 1);
-
-   BUG_ON(!task_on_rq_queued(p));
-   BUG_ON(!rt_task(p));
-
-   return p;
-}
-
  /*
   * If the current CPU has more than one RT task, see if the non
   * running task can migrate over to a CPU that is running a task





Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-11 Thread zhouchengming

polite ping and +cc, thanks!

On 2017/9/11 14:51, Zhou Chengming wrote:

push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
   pick task_A on rq1
   find rq0
 double_lock_balance(rq1, rq0)
   unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
   lock(rq0)
   lock(rq1)
 do_above_check(task_A)
   task_rq(task_A) == rq1
   cpus_allowed unchanged
   task_running == false
   rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)
  task_A->on_rq = 1
   task_on_rq_queued(task_A)
 above_check passed, return rq0
 ...
 migrate task_A from rq1 to rq0

So we can't rely on these checks of task_A to make sure the task_A is
still on the rq1, even though we hold the rq1->lock. This patch will
repick the first pushable task to be sure the task is still on the rq.

Signed-off-by: Zhou Chengming
---
  kernel/sched/rt.c | 49 +++--
  1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf93..787b721 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1703,6 +1703,26 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
  }

+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+   struct task_struct *p;
+
+   if (!has_pushable_tasks(rq))
+   return NULL;
+
+   p = plist_first_entry(>rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+   BUG_ON(rq->cpu != task_cpu(p));
+   BUG_ON(task_current(rq, p));
+   BUG_ON(p->nr_cpus_allowed<= 1);
+
+   BUG_ON(!task_on_rq_queued(p));
+   BUG_ON(!rt_task(p));
+
+   return p;
+}
+
  /* Will lock the rq it finds */
  static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
  {
@@ -1734,13 +1754,10 @@ static struct rq *find_lock_lowest_rq(struct 
task_struct *task, struct rq *rq)
 * We had to unlock the run queue. In
 * the mean time, task could have
 * migrated already or had its affinity changed.
-* Also make sure that it wasn't scheduled on its rq.
 */
-   if (unlikely(task_rq(task) != rq ||
-
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
-task_running(rq, task) ||
-!rt_task(task) ||
-!task_on_rq_queued(task))) {
+   struct task_struct *next_task = 
pick_next_pushable_task(rq);
+   if (unlikely(next_task != task ||
+
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed))) {

double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1760,26 +1777,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct 
*task, struct rq *rq)
return lowest_rq;
  }

-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
-   struct task_struct *p;
-
-   if (!has_pushable_tasks(rq))
-   return NULL;
-
-   p = plist_first_entry(>rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
-   BUG_ON(rq->cpu != task_cpu(p));
-   BUG_ON(task_current(rq, p));
-   BUG_ON(p->nr_cpus_allowed<= 1);
-
-   BUG_ON(!task_on_rq_queued(p));
-   BUG_ON(!rt_task(p));
-
-   return p;
-}
-
  /*
   * If the current CPU has more than one RT task, see if the non
   * running task can migrate over to a CPU that is running a task





Re: [PATCH] sched/rt.c: pick and check task if double_lock_balance() unlock the rq

2017-09-11 Thread zhouchengming

polite ping and +cc, thanks!

On 2017/9/11 14:51, Zhou Chengming wrote:

push_rt_task() pick the first pushable task and find an eligible
lowest_rq, then double_lock_balance(rq, lowest_rq). So if
double_lock_balance() unlock the rq (when double_lock_balance() return 1),
we have to check if this task is still on the rq.

The problem is that the check conditions are not sufficient:

if (unlikely(task_rq(task) != rq ||
 !cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {

cpu2cpu1cpu0
push_rt_task(rq1)
   pick task_A on rq1
   find rq0
 double_lock_balance(rq1, rq0)
   unlock(rq1)
rq1 __schedule
  pick task_A run
task_A sleep (dequeued)
   lock(rq0)
   lock(rq1)
 do_above_check(task_A)
   task_rq(task_A) == rq1
   cpus_allowed unchanged
   task_running == false
   rt_task(task_A) == true
try_to_wake_up(task_A)
  select_cpu = cpu3
  enqueue(rq3, task_A)
  task_A->on_rq = 1
   task_on_rq_queued(task_A)
 above_check passed, return rq0
 ...
 migrate task_A from rq1 to rq0

So we can't rely on these checks of task_A to make sure the task_A is
still on the rq1, even though we hold the rq1->lock. This patch will
repick the first pushable task to be sure the task is still on the rq.

Signed-off-by: Zhou Chengming
---
  kernel/sched/rt.c | 49 +++--
  1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf93..787b721 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1703,6 +1703,26 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
  }

+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+   struct task_struct *p;
+
+   if (!has_pushable_tasks(rq))
+   return NULL;
+
+   p = plist_first_entry(>rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+   BUG_ON(rq->cpu != task_cpu(p));
+   BUG_ON(task_current(rq, p));
+   BUG_ON(p->nr_cpus_allowed<= 1);
+
+   BUG_ON(!task_on_rq_queued(p));
+   BUG_ON(!rt_task(p));
+
+   return p;
+}
+
  /* Will lock the rq it finds */
  static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
  {
@@ -1734,13 +1754,10 @@ static struct rq *find_lock_lowest_rq(struct 
task_struct *task, struct rq *rq)
 * We had to unlock the run queue. In
 * the mean time, task could have
 * migrated already or had its affinity changed.
-* Also make sure that it wasn't scheduled on its rq.
 */
-   if (unlikely(task_rq(task) != rq ||
-
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed) ||
-task_running(rq, task) ||
-!rt_task(task) ||
-!task_on_rq_queued(task))) {
+   struct task_struct *next_task = 
pick_next_pushable_task(rq);
+   if (unlikely(next_task != task ||
+
!cpumask_test_cpu(lowest_rq->cpu,>cpus_allowed))) {

double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1760,26 +1777,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct 
*task, struct rq *rq)
return lowest_rq;
  }

-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
-   struct task_struct *p;
-
-   if (!has_pushable_tasks(rq))
-   return NULL;
-
-   p = plist_first_entry(>rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
-   BUG_ON(rq->cpu != task_cpu(p));
-   BUG_ON(task_current(rq, p));
-   BUG_ON(p->nr_cpus_allowed<= 1);
-
-   BUG_ON(!task_on_rq_queued(p));
-   BUG_ON(!rt_task(p));
-
-   return p;
-}
-
  /*
   * If the current CPU has more than one RT task, see if the non
   * running task can migrate over to a CPU that is running a task