Re: [RFC PATCH v2 4/6] sched/deadline: Introduce deadline servers

2020-10-06 Thread luca abeni
On Tue, 6 Oct 2020 11:35:23 +0200
Juri Lelli  wrote:
[...]
> > > + if (dl_se->server_has_tasks(dl_se)) {
> > > + enqueue_dl_entity(dl_se, dl_se, 
> > > ENQUEUE_REPLENISH);
> > > + resched_curr(rq);
> > > + __push_dl_task(rq, &rf);
> > > + } else {
> > > + replenish_dl_entity(dl_se, dl_se);  
> > 
> > I am wondering if here we need a "task_non_contending(dl_se)" after
> > "replenish_dl_entity(dl_se, dl_se);"...
> > 
> > Basically, what happened is that all the served tasks blocked while the
> > server was throttled... So, now the server should be disabled (that is, we
> > replenish the dl entity but we do not insert it in the runqueue).
> > But when the server finished its budget and was throttled, it was not
> > disabled (so, its utilization is still in running_bw).  
> 
> Hummm. For CFS, we call dl_server_stop() after the last CFS task blocks
> and that calls dequeue_dl(SLEEP), which should be calling
> task_non_contending(). That should be happening also while the server is
> throttled and CFS tasks are running outside of it, no?

You are right... I somehow lost this detail.


> Guess I'm missing something.

No, I was the one missing something :)
Sorry about the noise.



Thanks,
Luca


Re: [RFC PATCH v2 4/6] sched/deadline: Introduce deadline servers

2020-10-06 Thread luca abeni
Hi,

sorry for the late reply... Anyway, I am currently testing this
patchset (and trying to use it for the "SCHED_DEADLINE-based cgroup
scheduling" patchset).
And during my tests I had a doubt:



On Fri,  7 Aug 2020 11:50:49 +0200
Juri Lelli  wrote:

> From: Peter Zijlstra 
> 
> Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks
> with higher priority (e.g., SCHED_FIFO) monopolize CPU(s).
[...]
> @@ -1024,10 +1062,34 @@ static enum hrtimer_restart dl_task_timer(struct 
> hrtimer *timer)
>   struct sched_dl_entity *dl_se = container_of(timer,
>struct sched_dl_entity,
>dl_timer);
> - struct task_struct *p = dl_task_of(dl_se);
> + struct task_struct *p;
>   struct rq_flags rf;
>   struct rq *rq;
>  
> + if (dl_server(dl_se)) {
> + struct rq *rq = rq_of_dl_se(dl_se);
> + struct rq_flags rf;
> +
> + rq_lock(rq, &rf);
> + if (dl_se->dl_throttled) {
> + sched_clock_tick();
> + update_rq_clock(rq);
> +
> + if (dl_se->server_has_tasks(dl_se)) {
> + enqueue_dl_entity(dl_se, dl_se, 
> ENQUEUE_REPLENISH);
> + resched_curr(rq);
> + __push_dl_task(rq, &rf);
> + } else {
> + replenish_dl_entity(dl_se, dl_se);

I am wondering if here we need a "task_non_contending(dl_se)" after
"replenish_dl_entity(dl_se, dl_se);"...

Basically, what happened is that all the served tasks blocked while the
server was throttled... So, now the server should be disabled (that is, we
replenish the dl entity but we do not insert it in the runqueue).
But when the server finished its budget and was throttled, it was not
disabled (so, its utilization is still in running_bw).

I think that if we do not call task_non_contending() the server's
utilization is not correctly removed from running_bw (this problem does
not affect regular tasks, because task_non_contending() is invoked when
they block, even if the dl entity is throttled... But for servers I
think we have an issue).

What do you think? Do you agree with this change?

I have no easy way to reproduce the issue, but this change fixed some
strange behaviours I was seeing when using this patch to schedule RT
cgroups.
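
For illustration, here is a stand-alone toy model of the bookkeeping concern
(plain user-space C; the names are made up and this is not kernel code). The
server contributes dl_bw to running_bw while it is "contending"; if nothing
clears that contribution when the last served task blocks while the server is
throttled, the reclaiming/cpufreq side keeps seeing stale utilization:

#include <stdio.h>

struct toy_server { unsigned long dl_bw; int contending; };

static unsigned long running_bw;

static void toy_task_contending(struct toy_server *s)
{
	if (!s->contending) { s->contending = 1; running_bw += s->dl_bw; }
}

static void toy_task_non_contending(struct toy_server *s)
{
	if (s->contending) { s->contending = 0; running_bw -= s->dl_bw; }
}

int main(void)
{
	struct toy_server srv = { .dl_bw = 100, .contending = 0 };

	toy_task_contending(&srv);     /* server starts serving its tasks  */
	/* ... server gets throttled, then all served tasks block ...      */
	printf("running_bw with stale contribution: %lu\n", running_bw);
	toy_task_non_contending(&srv); /* what the suggested call would do */
	printf("running_bw after removing it:       %lu\n", running_bw);
	return 0;
}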



Thanks,
Luca
> + }
> +
> + }
> + rq_unlock(rq, &rf);
> +
> + return HRTIMER_NORESTART;
> + }
> +
> + p = dl_task_of(dl_se);
>   rq = task_rq_lock(p, &rf);
>  
>   /*
> @@ -1098,21 +1160,7 @@ static enum hrtimer_restart dl_task_timer(struct 
> hrtimer *timer)
>   else
>   resched_curr(rq);
>  
> -#ifdef CONFIG_SMP
> - /*
> -  * Queueing this task back might have overloaded rq, check if we need
> -  * to kick someone away.
> -  */
> - if (has_pushable_dl_tasks(rq)) {
> - /*
> -  * Nothing relies on rq->lock after this, so its safe to drop
> -  * rq->lock.
> -  */
> - rq_unpin_lock(rq, &rf);
> - push_dl_task(rq);
> - rq_repin_lock(rq, &rf);
> - }
> -#endif
> + __push_dl_task(rq, &rf);
>  
>  unlock:
>   task_rq_unlock(rq, p, &rf);
> @@ -1154,12 +1202,11 @@ static void init_dl_task_timer(struct sched_dl_entity 
> *dl_se)
>   */
>  static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
>  {
> - struct task_struct *p = dl_task_of(dl_se);
> - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
> + struct rq *rq = rq_of_dl_se(dl_se);
>  
>   if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
>   dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
> - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
> + if (unlikely(dl_se->dl_boosted || !start_dl_timer(dl_se)))
>   return;
>   dl_se->dl_throttled = 1;
>   if (dl_se->runtime > 0)
> @@ -1216,29 +1263,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, 
> struct sched_dl_entity *dl_se)
>   return (delta * u_act) >> BW_SHIFT;
>  }
>  
> -/*
> - * Update the current task's runtime statistics (provided it is still
> - * a -deadline task and has not been removed from the dl_rq).
> - */
> -static void update_curr_dl(struct rq *rq)
> +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, 
> s64 delta_exec)
>  {
> - struct task_struct *curr = rq->curr;
> - struct sched_dl_entity *dl_se = &curr->dl;
> - s64 delta_exec, scaled_delta_exec;
> - int cpu = cpu_of(rq);
> + s64 scaled_delta_exec;
>  
> - if (!dl_task(curr) || !on_dl_rq(dl_se))
> - return;
> -
> - /*
> -  * Consumed budget is computed considering the time as
> -  * observed by schedulable 

Re: [RFC PATCH v2 0/6] SCHED_DEADLINE server infrastructure

2020-08-07 Thread luca abeni
Hi Juri,

thanks for sharing the v2 patchset!

In the next days I'll have a look at it, and try some tests...

In the meanwhile, I have some questions/comments after a first quick
look.

If I understand correctly, the patchset does not apply deadline servers to
FIFO and RR tasks, right? How does this patchset interact with RT
throttling?

If I understand correctly, patch 6/6 does something like "use deadline
servers for SCHED_OTHER only if FIFO/RR tasks risk starving
SCHED_OTHER tasks"... Right? I understand this is because you do not
want to delay RT tasks if they are not starving other tasks. But then,
maybe what you want is not deadline-based scheduling. Maybe a
reservation-based scheduler based on fixed priorities is what you want?
(with SCHED_DEADLINE, you could provide exact performance guarantees to
SCHED_OTHER tasks, but I suspect patch 6/6 breaks these guarantees?)


Thanks,
Luca

On Fri,  7 Aug 2020 11:50:45 +0200
Juri Lelli  wrote:

> Hi,
> 
> This is RFC v2 of Peter's SCHED_DEADLINE server infrastructure
> implementation [1].
> 
> SCHED_DEADLINE servers can help fix starvation issues of low
> priority tasks (e.g., SCHED_OTHER) when higher priority tasks
> monopolize CPU cycles. Today we have RT Throttling; DEADLINE servers
> should be able to replace and improve that.
> 
> I rebased Peter's patches (adding changelogs where needed) on
> tip/sched/core as of today and incorporated fixes to issues discussed
> during RFC v1. Current set seems to even boot on real HW! :-)
> 
> While playing with the RFC v1 set (and discussing it further offline with
> Daniel) the need emerged to slightly change the behavior. Patch
> 6/6 is a (cumbersome?) attempt to show what's probably needed.
> The problem with the "original" implementation is that FIFO tasks might
> suffer preemption from NORMAL even when spare CPU cycles are
> available. In fact, fair deadline server is enqueued right away when
> NORMAL tasks wake up and they are first scheduled by the server, thus
> potentially preempting a well behaving FIFO task. This is of course
> not ideal. So, in patch 6/6 I propose to use some kind of starvation
> monitor/watchdog that delays enqueuing of deadline servers to the
> point when fair tasks might start to actually suffer from starvation
> (just randomly picked HZ/2 for now). One problem I already see with
> the current implementation is that it adds overhead to fair paths, so
> I'm pretty sure there are better ways to implement the idea (e.g.,
> Daniel already suggested using a starvation monitor kthread sort of
> thing).
> 
> Receiving comments and suggestions is the sole purpose of this posting
> at this stage. Hopefully we can further discuss the idea at Plumbers
> in a few weeks. So, please don't focus too much on the actual
> implementation (which I plan to revise anyway after I'm back from pto
> :), but try to see if this might actually fly. The feature seems to
> be very much needed.
> 
> Thanks!
> 
> Juri
> 
> 1 -
> https://lore.kernel.org/lkml/20190726145409.947503...@infradead.org/
> 
> Juri Lelli (1):
>   sched/fair: Implement starvation monitor
> 
> Peter Zijlstra (5):
>   sched: Unify runtime accounting across classes
>   sched/deadline: Collect sched_dl_entity initialization
>   sched/deadline: Move bandwidth accounting into {en,de}queue_dl_entity
>   sched/deadline: Introduce deadline servers
>   sched/fair: Add trivial fair server
> 
>  include/linux/sched.h    |  28 ++-
>  kernel/sched/core.c      |  23 +-
>  kernel/sched/deadline.c  | 483 ---
>  kernel/sched/fair.c      | 136 ++-
>  kernel/sched/rt.c        |  17 +-
>  kernel/sched/sched.h     |  50 +++-
>  kernel/sched/stop_task.c |  16 +-
>  7 files changed, 522 insertions(+), 231 deletions(-)
> 



Re: [RFC PATCH v2 6/6] sched/fair: Implement starvation monitor

2020-08-07 Thread luca abeni
On Fri, 7 Aug 2020 15:43:53 +0200
Juri Lelli  wrote:

> On 07/08/20 15:28, luca abeni wrote:
> > Hi Juri,
> > 
> > On Fri, 7 Aug 2020 11:56:04 +0200
> > Juri Lelli  wrote:
> >   
> > > Starting deadline server for lower priority classes right away
> > > when first task is enqueued might break guarantees  
> > 
> > Which guarantees are you thinking about, here? Response times of
> > fixed priority tasks?  
> 
> Response time, but also wakeup latency (which, for better or worse, is
> another important metric).
> 
> > If fixed priority tasks are also scheduled through deadline servers,
> > then you can provide response-time guarantees to them even when
> > lower-priority/non-real-time tasks are scheduled through deadline
> > servers.  
> 
> Right, but I fear we won't be able to keep the current behavior for
> wakeups: RT with highest prio always gets scheduled right away?

Uhm... I think this depends on how the servers' parameters are
designed: if "wrong" (or "bad") parameters are assigned to the server used
to schedule RT tasks, this property is broken.

(however, notice that even with the current patchset the highest
priority task might be scheduled with some delay --- if the SCHED_OTHER
deadline server is active because SCHED_OTHER tasks are being starved).



Luca


Re: [RFC PATCH v2 6/6] sched/fair: Implement starvation monitor

2020-08-07 Thread luca abeni
Hi Peter,

On Fri, 7 Aug 2020 12:46:18 +0200
pet...@infradead.org wrote:

> On Fri, Aug 07, 2020 at 11:56:04AM +0200, Juri Lelli wrote:
> > Starting deadline server for lower priority classes right away when
> > first task is enqueued might break guarantees, as tasks belonging to
> > intermediate priority classes could be uselessly preempted. E.g., a
> > well behaving (non hog) FIFO task can be preempted by NORMAL tasks
> > even if there are still CPU cycles available for NORMAL tasks to
> > run, as they'll be running inside the fair deadline server for some
> > period of time.
> > 
> > To prevent this issue, implement a starvation monitor mechanism that
> > starts the deadline server only if a (fair in this case) task hasn't
> > been scheduled for some interval of time after it has been enqueued.
> > Use pick/put functions to manage starvation monitor status.  
> 
> One thing I considered was scheduling this as a least-laxity entity --
> such that it runs late, not early

Are you thinking about scheduling both RT and non-RT tasks through
deadline servers? If yes, then I think that using something like
laxity-based scheduling for the SCHED_OTHER server can be a good idea
(but then we need to understand how to combine deadline-based
scheduling with laxity-based scheduling, etc...)

Or are you thinking about keeping the SCHED_OTHER server throttled
until its laxity is 0 (or until its laxity is lower than some small
value)? In this second case, the approach would work even if RT tasks
are not scheduled through a server (but I do not know which kind of
performance guarantee we could provide).
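
For reference, a stand-alone sketch of the second interpretation (user-space
C with made-up names, purely illustrative): keep the SCHED_OTHER server
throttled and only activate it at its zero-laxity instant, i.e. when the time
to its deadline minus its remaining runtime drops to zero:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* all values in arbitrary time units; the fields are illustrative */
struct toy_server {
	int64_t deadline;	/* absolute scheduling deadline */
	int64_t runtime;	/* remaining runtime            */
};

static int64_t toy_laxity(const struct toy_server *s, int64_t now)
{
	return (s->deadline - now) - s->runtime;
}

static bool toy_should_activate(const struct toy_server *s, int64_t now)
{
	/* waiting any longer would make the server unable to consume its
	 * full remaining runtime before the deadline */
	return toy_laxity(s, now) <= 0;
}

int main(void)
{
	struct toy_server srv = { .deadline = 100, .runtime = 30 };

	for (int64_t now = 0; now <= 100; now += 35)
		printf("t=%3lld laxity=%3lld activate=%d\n", (long long)now,
		       (long long)toy_laxity(&srv, now),
		       toy_should_activate(&srv, now));
	return 0;
}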


> -- and start the server when
> rq->nr_running != rq->cfs.h_nr_running, IOW when there's !fair tasks
> around.

Yes, this could be a good optimization.



Luca
> 
> Not saying we should do it like that, but that's perhaps more
> deterministic than this.



Re: [RFC PATCH v2 0/6] SCHED_DEADLINE server infrastructure

2020-08-07 Thread luca abeni
Hi Juri,

On Fri, 7 Aug 2020 15:30:41 +0200
Juri Lelli  wrote:
[...]
> > In the meanwhile, I have some questions/comments after a first quick
> > look.
> > 
> > If I understand correctly, the patchset does not apply deadline servers
> > to FIFO and RR tasks, right? How does this patchset interact with RT
> > throttling?  
> 
> Well, it's something like the dual of it, in that RT Throttling
> directly throttles RT tasks to make spare CPU cycles available to
> fair tasks while this patchset introduces deadline servers to
> schedule fair tasks, thus still reserving CPU time for those (when
> needed).

Ah, OK... I was thinking about using deadline servers for both RT and
non-RT tasks. And to use them not only to throttle, but also to provide
some kind of performance guarantees (to all the scheduling classes).
Think about what can be done when combining this mechanism with
cgroups/containers :)

[...]
> > I understand this is because you do not
> > want to delay RT tasks if they are not starving other tasks. But
> > then, maybe what you want is not deadline-based scheduling. Maybe a
> > reservation-based scheduler based on fixed priorities is what you
> > want? (with SCHED_DEADLINE, you could provide exact performance
> > guarantees to SCHED_OTHER tasks, but I suspect patch 6/6 breaks
> > these guarantees?)  
> 
> I agree that we are not interested in exact guarantees in this case,
> but why not use something that's already there and would give us
> the functionality we need (fix starvation for fair)?

Ok, if performance guarantees to non-RT tasks are not the goal, then I
agree. I was thinking that since the patchset provides a mechanism to
schedule various classes of tasks through deadline servers,
using these servers to provide some kind of guarantee could be
interesting ;-)



Thanks,
Luca

> It would also
> work well in the presence of "real" deadline tasks I think, in that you
> could account for these fair servers while performing admission
> control.
> 
> Best,
> 
> Juri
> 



Re: [RFC PATCH v2 6/6] sched/fair: Implement starvation monitor

2020-08-07 Thread luca abeni
Hi Juri,

On Fri, 7 Aug 2020 11:56:04 +0200
Juri Lelli  wrote:

> Starting deadline server for lower priority classes right away when
> first task is enqueued might break guarantees

Which guarantees are you thinking about, here? Response times of fixed
priority tasks?

If fixed priority tasks are also scheduled through deadline servers,
then you can provide response-time guarantees to them even when
lower-priority/non-real-time tasks are scheduled through deadline
servers.


Thanks,
Luca

> as tasks belonging to
> intermediate priority classes could be uselessly preempted. E.g., a
> well behaving (non hog) FIFO task can be preempted by NORMAL tasks
> even if there are still CPU cycles available for NORMAL tasks to run,
> as they'll be running inside the fair deadline server for some period
> of time.
> 
> To prevent this issue, implement a starvation monitor mechanism that
> starts the deadline server only if a (fair in this case) task hasn't
> been scheduled for some interval of time after it has been enqueued.
> Use pick/put functions to manage starvation monitor status.
> 
> Signed-off-by: Juri Lelli 
> ---
>  kernel/sched/fair.c  | 57 ++--
>  kernel/sched/sched.h |  4
>  2 files changed, 59 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6a97ee2a4e26d..5cdf76e508074 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5494,6 +5494,53 @@ static int sched_idle_cpu(int cpu)
>  }
>  #endif
>  
> +
> +static void fair_server_watchdog(struct timer_list *list)
> +{
> + struct rq *rq = container_of(list, struct rq,
> fair_server_wd);
> + struct rq_flags rf;
> +
> + rq_lock_irqsave(rq, &rf);
> + rq->fair_server_wd_running = 0;
> +
> + if (!rq->cfs.h_nr_running)
> + goto out;
> +
> + update_rq_clock(rq);
> + dl_server_start(&rq->fair_server);
> + rq->fair_server_active = 1;
> + resched_curr(rq);
> +
> +out:
> + rq_unlock_irqrestore(rq, &rf);
> +}
> +
> +static inline void fair_server_watchdog_start(struct rq *rq)
> +{
> + if (rq->fair_server_wd_running || rq->fair_server_active)
> + return;
> +
> + timer_setup(&rq->fair_server_wd, fair_server_watchdog, 0);
> + rq->fair_server_wd.expires = jiffies + FAIR_SERVER_WATCHDOG_INTERVAL;
> + add_timer_on(&rq->fair_server_wd, cpu_of(rq));
> + rq->fair_server_active = 0;
> + rq->fair_server_wd_running = 1;
> +}
> +
> +static inline void fair_server_watchdog_stop(struct rq *rq, bool stop_server)
> +{
> + if (!rq->fair_server_wd_running && !stop_server)
> + return;
> +
> + del_timer(&rq->fair_server_wd);
> + rq->fair_server_wd_running = 0;
> +
> + if (stop_server && rq->fair_server_active) {
> + dl_server_stop(&rq->fair_server);
> + rq->fair_server_active = 0;
> + }
> +}
> +
>  /*
>   * The enqueue_task method is called before nr_running is
>   * increased. Here we update the fair scheduling stats and
> @@ -5515,7 +5562,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>   util_est_enqueue(&rq->cfs, p);
>  
>   if (!rq->cfs.h_nr_running)
> - dl_server_start(&rq->fair_server);
> + fair_server_watchdog_start(rq);
>  
>   /*
>* If in_iowait is set, the code below may not trigger any cpufreq
> @@ -5670,7 +5717,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  dequeue_throttle:
>   if (!rq->cfs.h_nr_running)
> - dl_server_stop(&rq->fair_server);
> + fair_server_watchdog_stop(rq, true);
>  
>   util_est_dequeue(&rq->cfs, p, task_sleep);
>   hrtick_update(rq);
> @@ -7123,6 +7170,7 @@ done: __maybe_unused;
>   hrtick_start_fair(rq, p);
>  
>   update_misfit_status(p, rq);
> + fair_server_watchdog_stop(rq, false);
>  
>   return p;
>  
> @@ -7178,6 +7226,8 @@ void fair_server_init(struct rq *rq)
>   dl_se->dl_period = 20 * TICK_NSEC;
>  
> + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick);
> +
> + rq->fair_server_wd_running = 0;
>  }
>  
>  /*
> @@ -7192,6 +7242,9 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
>   cfs_rq = cfs_rq_of(se);
>   put_prev_entity(cfs_rq, se);
>   }
> +
> + if (rq->cfs.h_nr_running)
> + fair_server_watchdog_start(rq);
>  }
>  
>  /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index bf8c9c07705c9..1e1a5436be725 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -375,6 +375,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
>   dl_server_has_tasks_f has_tasks,
>   dl_server_pick_f pick);
>  
> +#define FAIR_SERVER_WATCHDOG_INTERVAL (HZ >> 1)
>  extern void fair_server_init(struct rq *);
>  
>  #ifdef CONFIG_CGROUP_SCHED
> @@ -962,6 +963,9 @@ struct rq {
>   struct dl_rq  

[tip: sched/core] sched/deadline: Implement fallback mechanism for !fit case

2020-06-16 Thread tip-bot2 for Luca Abeni
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 23e71d8ba42933bff12e453858fd68c073bc5258
Gitweb:
https://git.kernel.org/tip/23e71d8ba42933bff12e453858fd68c073bc5258
Author:Luca Abeni 
AuthorDate:Wed, 20 May 2020 15:42:43 +02:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 15 Jun 2020 14:10:05 +02:00

sched/deadline: Implement fallback mechanism for !fit case

When a task has a runtime that cannot be served within the scheduling
deadline by any of the idle CPUs (later_mask), the task is doomed to miss
its deadline.

This can happen since the SCHED_DEADLINE admission control guarantees
only bounded tardiness and not the hard respect of all deadlines.
In this case try to select the idle CPU with the largest CPU capacity
to minimize tardiness.

Favor task_cpu(p) if it has max capacity of !fitting CPUs so that
find_later_rq() can potentially still return it (most likely cache-hot)
early.

Signed-off-by: Luca Abeni 
Signed-off-by: Dietmar Eggemann 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Juri Lelli 
Link: https://lkml.kernel.org/r/20200520134243.19352-6-dietmar.eggem...@arm.com
---
 kernel/sched/cpudeadline.c | 20 
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8630f2a..8cb06c8 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,19 +121,31 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
-   int cpu;
+   unsigned long cap, max_cap = 0;
+   int cpu, max_cpu = -1;
 
if (!static_branch_unlikely(&sched_asym_cpucapacity))
return 1;
 
/* Ensure the capacity of the CPUs fits the task. */
for_each_cpu(cpu, later_mask) {
-   if (!dl_task_fits_capacity(p, cpu))
+   if (!dl_task_fits_capacity(p, cpu)) {
cpumask_clear_cpu(cpu, later_mask);
+
+   cap = capacity_orig_of(cpu);
+
+   if (cap > max_cap ||
+   (cpu == task_cpu(p) && cap == max_cap)) {
+   max_cap = cap;
+   max_cpu = cpu;
+   }
+   }
}
 
-   if (!cpumask_empty(later_mask))
-   return 1;
+   if (cpumask_empty(later_mask))
+   cpumask_set_cpu(max_cpu, later_mask);
+
+   return 1;
} else {
int best_cpu = cpudl_maximum(cp);
 


[tip: sched/core] sched/deadline: Improve admission control for asymmetric CPU capacities

2020-06-16 Thread tip-bot2 for Luca Abeni
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 60ffd5edc5e4fa69622c125c54ef8e7d5d894af8
Gitweb:
https://git.kernel.org/tip/60ffd5edc5e4fa69622c125c54ef8e7d5d894af8
Author:Luca Abeni 
AuthorDate:Wed, 20 May 2020 15:42:41 +02:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 15 Jun 2020 14:10:05 +02:00

sched/deadline: Improve admission control for asymmetric CPU capacities

The current SCHED_DEADLINE (DL) admission control ensures that

sum of reserved CPU bandwidth < x * M

where

x = /proc/sys/kernel/sched_rt_{runtime,period}_us
M = # CPUs in root domain.

DL admission control works well for homogeneous systems where the
capacity of all CPUs is equal (1024). I.e. bounded tardiness for DL
and non-starvation of non-DL tasks is guaranteed.

But on heterogeneous systems, where the capacities of CPUs differ, it
could fail by over-allocating CPU time on smaller-capacity CPUs.

On an Arm big.LITTLE/DynamIQ system DL tasks can easily starve other
tasks making it unusable.

Fix this by explicitly considering the CPU capacity in the DL admission
test by replacing M with the root domain CPU capacity sum.
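
As a rough numeric illustration of why the capacity sum matters (a stand-alone
user-space sketch with made-up numbers, not part of the commit): on a 4-CPU
big.LITTLE system with two capacity-1024 CPUs and two capacity-512 CPUs,
counting CPUs (M = 4) admits more bandwidth than the 3 "CPUs worth" of
capacity actually available:

#include <stdio.h>

int main(void)
{
	double x = 0.95;			/* sched_rt_runtime/period     */
	double caps[] = { 1024, 1024, 512, 512 };
	int m = 4;
	double cap_sum = 0;
	double requested = 3.5;			/* total reserved DL bandwidth */

	for (int i = 0; i < m; i++)
		cap_sum += caps[i] / 1024.0;

	printf("old test: %.2f < %.2f * %d    -> %s\n", requested, x, m,
	       requested < x * m ? "admitted" : "rejected");
	printf("new test: %.2f < %.2f * %.2f -> %s\n", requested, x, cap_sum,
	       requested < x * cap_sum ? "admitted" : "rejected");
	return 0;
}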

Signed-off-by: Luca Abeni 
Signed-off-by: Dietmar Eggemann 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Juri Lelli 
Link: https://lkml.kernel.org/r/20200520134243.19352-4-dietmar.eggem...@arm.com
---
 kernel/sched/deadline.c | 30 +-
 kernel/sched/sched.h|  6 +++---
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 01f474a..9ebd0a9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2590,11 +2590,12 @@ void sched_dl_do_global(void)
 int sched_dl_overflow(struct task_struct *p, int policy,
  const struct sched_attr *attr)
 {
-   struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
-   int cpus, err = -1;
+   int cpus, err = -1, cpu = task_cpu(p);
+   struct dl_bw *dl_b = dl_bw_of(cpu);
+   unsigned long cap;
 
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2609,15 +2610,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 * allocated bandwidth of the container.
 */
raw_spin_lock(&dl_b->lock);
-   cpus = dl_bw_cpus(task_cpu(p));
+   cpus = dl_bw_cpus(cpu);
+   cap = dl_bw_capacity(cpu);
+
if (dl_policy(policy) && !task_has_dl_policy(p) &&
-   !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+   !__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
-  !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+  !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
 * XXX this is slightly incorrect: when the task
 * utilization decreases, we should delay the total
@@ -2772,19 +2775,19 @@ bool dl_param_changed(struct task_struct *p, const 
struct sched_attr *attr)
 #ifdef CONFIG_SMP
 int dl_task_can_attach(struct task_struct *p, const struct cpumask 
*cs_cpus_allowed)
 {
+   unsigned long flags, cap;
unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
-   int cpus, ret;
-   unsigned long flags;
+   int ret;
 
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
 
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
-   cpus = dl_bw_cpus(dest_cpu);
-   overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+   cap = dl_bw_capacity(dest_cpu);
+   overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
if (overflow) {
ret = -EBUSY;
} else {
@@ -2794,6 +2797,8 @@ int dl_task_can_attach(struct task_struct *p, const 
struct cpumask *cs_cpus_allo
 * We will free resources in the source root_domain
 * later on (see set_cpus_allowed_dl()).
 */
+   int cpus = dl_bw_cpus(dest_cpu);
+
__dl_add(dl_b, p->dl.dl_bw, cpus);
ret = 0;
}
@@ -2826,16 +2831,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask 
*cur,
 
 bool dl_cpu_busy(unsigned int cpu)
 {
-   unsigned long flags;
+   unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow;
-   int cpus;
 
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, fl

[tip: sched/core] sched/deadline: Make DL capacity-aware

2020-06-16 Thread tip-bot2 for Luca Abeni
The following commit has been merged into the sched/core branch of tip:

Commit-ID: b4118988fdcb4554ea6687dd8ff68bcab690b8ea
Gitweb:
https://git.kernel.org/tip/b4118988fdcb4554ea6687dd8ff68bcab690b8ea
Author:Luca Abeni 
AuthorDate:Wed, 20 May 2020 15:42:42 +02:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 15 Jun 2020 14:10:05 +02:00

sched/deadline: Make DL capacity-aware

The current SCHED_DEADLINE (DL) scheduler uses a global EDF scheduling
algorithm w/o considering CPU capacity or task utilization.
This works well on homogeneous systems where DL tasks are guaranteed
to have a bounded tardiness but presents issues on heterogeneous
systems.

A DL task can migrate to a CPU which does not have enough CPU capacity
to correctly serve the task (e.g. a task w/ 70ms runtime and 100ms
period on a CPU w/ 512 capacity).

Add the DL fitness function dl_task_fits_capacity() for DL admission
control on heterogeneous systems. A task fits onto a CPU if:

CPU original capacity / 1024 >= task runtime / task deadline

Use this function on heterogeneous systems to try to find a CPU which
meets this criterion during task wakeup, push and offline migration.

On homogeneous systems the original behavior of the DL admission
control should be retained.
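
A quick numeric check of the fitness rule above (stand-alone illustration
only, in user-space C): for the 70ms/100ms task from the changelog,
512/1024 = 0.5 < 70/100 = 0.7, so a capacity-512 CPU does not fit, while a
capacity-1024 CPU does:

#include <stdbool.h>
#include <stdio.h>

static bool toy_fits(unsigned long cap, unsigned long runtime,
		     unsigned long deadline)
{
	/* cap/1024 >= runtime/deadline, kept in integer arithmetic */
	return cap * deadline >= 1024UL * runtime;
}

int main(void)
{
	printf("capacity  512: %s\n", toy_fits(512, 70, 100) ? "fits" : "does not fit");
	printf("capacity 1024: %s\n", toy_fits(1024, 70, 100) ? "fits" : "does not fit");
	return 0;
}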

Signed-off-by: Luca Abeni 
Signed-off-by: Dietmar Eggemann 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Juri Lelli 
Link: https://lkml.kernel.org/r/20200520134243.19352-5-dietmar.eggem...@arm.com
---
 kernel/sched/cpudeadline.c | 14 +-
 kernel/sched/deadline.c| 18 ++
 kernel/sched/sched.h   | 15 +++
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012..8630f2a 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,7 +121,19 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
-   return 1;
+   int cpu;
+
+   if (!static_branch_unlikely(&sched_asym_cpucapacity))
+   return 1;
+
+   /* Ensure the capacity of the CPUs fits the task. */
+   for_each_cpu(cpu, later_mask) {
+   if (!dl_task_fits_capacity(p, cpu))
+   cpumask_clear_cpu(cpu, later_mask);
+   }
+
+   if (!cpumask_empty(later_mask))
+   return 1;
} else {
int best_cpu = cpudl_maximum(cp);
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9ebd0a9..84e84ba 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1643,6 +1643,7 @@ static int
 select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
struct task_struct *curr;
+   bool select_rq;
struct rq *rq;
 
if (sd_flag != SD_BALANCE_WAKE)
@@ -1662,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int 
sd_flag, int flags)
 * other hand, if it has a shorter deadline, we
 * try to make it stay here, it might be important.
 */
-   if (unlikely(dl_task(curr)) &&
-   (curr->nr_cpus_allowed < 2 ||
-!dl_entity_preempt(&p->dl, &curr->dl)) &&
-   (p->nr_cpus_allowed > 1)) {
+   select_rq = unlikely(dl_task(curr)) &&
+   (curr->nr_cpus_allowed < 2 ||
+!dl_entity_preempt(&p->dl, &curr->dl)) &&
+   p->nr_cpus_allowed > 1;
+
+   /*
+* Take the capacity of the CPU into account to
+* ensure it fits the requirement of the task.
+*/
+   if (static_branch_unlikely(&sched_asym_cpucapacity))
+   select_rq |= !dl_task_fits_capacity(p, cpu);
+
+   if (select_rq) {
int target = find_later_rq(p);
 
if (target != -1 &&
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91b250f..3368876 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -317,6 +317,21 @@ static inline bool __dl_overflow(struct dl_bw *dl_b, 
unsigned long cap,
   cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
 }
 
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the CPU original capacity of the
+ * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
+ * task and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+   unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+   return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
+}
+
 extern void init_dl_bw(struct dl_bw *dl_b);
 extern int  sched_dl_global_validate(void);
 extern void sched_dl_do_global(void);


Re: [PATCH 4/5] sched/deadline: Cleanup on_dl_rq() handling

2019-07-31 Thread luca abeni
On Wed, 31 Jul 2019 18:32:47 +0100
Dietmar Eggemann  wrote:
[...]
>   static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
>   {
>  +if (!on_dl_rq(dl_se))
>  +return;  
> >>>
> >>> Why allow double dequeue instead of WARN?  
> >>
> >> As I was saying to Valentin, it can currently happen that a task
> >> could have already been dequeued by update_curr_dl()->throttle
> >> called by dequeue_task_dl() before calling __dequeue_task_dl(). Do
> >> you think we should check for this condition before calling into
> >> dequeue_dl_entity()?  
> > 
> > Yes, that's what ->dl_throttled is for, right? And !->dl_throttled
> > && !on_dl_rq() is a BUG.  
> 
> OK, I will add the following snippet to the patch.
> Although it's easy to provoke a situation in which DL tasks are
> throttled, I haven't seen a throttling happening when the task is
> being dequeued.

This is a not-so-common situation that can happen with periodic tasks
(a-la rt-app) blocking on clock_nanosleep() (or similar) after
executing for an amount of time comparable with the SCHED_DEADLINE
runtime.

It might happen that the task consumed a little bit more than the
remaining runtime (but has not been throttled yet, because the
accounting happens at every tick)... So, when dequeue_task_dl() invokes
update_curr_dl() the runtime becomes negative and the task is throttled.

This happens infrequently, but if you try rt-app tasksets with multiple
tasks and execution times close to the runtime you will see it
happening, sooner or later.
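
A toy illustration of that scenario (stand-alone user-space C with made-up
numbers, not kernel code): accounting happens at the tick, so a task that
blocks between two ticks can have consumed slightly more than its remaining
runtime, and the update done at dequeue time then drives the runtime negative
and throttles the task while it is being dequeued:

#include <stdio.h>

int main(void)
{
	long runtime_ns = 10 * 1000 * 1000;	/* 10ms of budget left      */
	long tick_ns    =  4 * 1000 * 1000;	/* 4ms between updates      */

	/* two full ticks are accounted: 2ms of budget apparently remains */
	runtime_ns -= 2 * tick_ns;

	/* the task runs 3ms more and then blocks before the next tick;
	 * the dequeue-time update finds a negative runtime              */
	runtime_ns -= 3 * 1000 * 1000;

	printf("runtime at dequeue: %ld ns -> %s\n", runtime_ns,
	       runtime_ns < 0 ? "throttled while being dequeued" : "still ok");
	return 0;
}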


[...]
> @@ -1592,6 +1591,10 @@ static void __dequeue_task_dl(struct rq *rq, struct task_struct *p)
>  static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  {
> update_curr_dl(rq);
> +
> +   if (p->dl.dl_throttled)
> +   return;

Sorry, I missed part of the previous discussion, so maybe I am missing
something... But I suspect this "return" might be wrong (you risk
missing a call to task_non_contending(), coming later in this function).

Maybe you could use
if (!p->dl.dl_throttled)
__dequeue_task_dl(rq, p);

Or did I misunderstand something?



Thanks,
Luca


Re: [RFC][PATCH 00/13] SCHED_DEADLINE server infrastructure

2019-07-26 Thread luca abeni
Hi Peter,

On Fri, 26 Jul 2019 16:54:09 +0200
Peter Zijlstra  wrote:

> Hi,
> 
> So recently syzcaller ran into the big deadline/period issue (again),
> and I figured I should at least propose a patch that puts limits on
> that -- see Patch 1.
> 
> During that discussion; SCHED_OTHER servers got mentioned (also
> again), and I figured I should have a poke at that too. So I took
> some inspiration from patches Alessio Balsini send a while back and
> cobbled something together for that too.

I think Patch 1 is a very good idea!

The server patches look interesting (and they seem to be much simpler
than our patchset :). I need to have a better look at them, but this
seems to be very promising.



Thanks,
Luca



> 
> Included are also a bunch of patches I did for core scheduling (2-8),
> which I'm probably going to just merge as they're generic cleanups.
> They're included here because they make pick_next_task() simpler and
> thereby thinking about the nested pick_next_task() logic inherent in
> servers was less of a head-ache. (I think it can work fine without
> them, but its easier with them on)
> 
> Anyway; after it all compiled it booted a kvm image all the way to
> userspace on the first run, so clearly this code isn't to be trusted
> at all.
> 
> There's still lots of missing bits and pieces -- like changelogs and
> the fair server isn't configurable or hooked into the bandwidth
> accounting, but the foundation is there I think.
> 
> Enjoy!
> 



Re: [PATCH 1/5] sched/deadline: Fix double accounting of rq/running bw in push_dl_task()

2019-07-26 Thread luca abeni
Hi,

On Fri, 26 Jul 2019 09:27:52 +0100
Dietmar Eggemann  wrote:
[...]
> @@ -2121,17 +2121,13 @@ static int push_dl_task(struct rq *rq)
>   }
>  
>   deactivate_task(rq, next_task, 0);
> - sub_running_bw(&next_task->dl, &rq->dl);
> - sub_rq_bw(&next_task->dl, &rq->dl);
>   set_task_cpu(next_task, later_rq->cpu);
> - add_rq_bw(&next_task->dl, &later_rq->dl);
>  
>   /*
>* Update the later_rq clock here, because the clock is used
>* by the cpufreq_update_util() inside __add_running_bw().
>*/
>   update_rq_clock(later_rq);
> - add_running_bw(&next_task->dl, &later_rq->dl);

Looking at the code again and thinking a little bit more about this
issue, I suspect a similar change is needed in pull_dl_task() too, no?



Luca


Re: [PATCH 5/5] sched/deadline: Use return value of SCHED_WARN_ON() in bw accounting

2019-07-26 Thread luca abeni
Hi Dietmar,

On Fri, 26 Jul 2019 09:27:56 +0100
Dietmar Eggemann  wrote:

> To make the decision whether to set rq or running bw to 0 in underflow
> case use the return value of SCHED_WARN_ON() rather than an extra if
> condition.

I think I tried this at some point, but as far as I remember this
solution does not work correctly when SCHED_DEBUG is not enabled.



Luca


> 
> Signed-off-by: Dietmar Eggemann 
> ---
>  kernel/sched/deadline.c | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index a9cb52ceb761..66c594b5507e 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -95,8 +95,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq
> *dl_rq) 
>   lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
>   dl_rq->running_bw -= dl_bw;
> - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
> - if (dl_rq->running_bw > old)
> + if (SCHED_WARN_ON(dl_rq->running_bw > old)) /* underflow */
>   dl_rq->running_bw = 0;
>   /* kick cpufreq (see the comment in kernel/sched/sched.h). */
>   cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
> @@ -119,8 +118,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
>  
>   lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
>   dl_rq->this_bw -= dl_bw;
> - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
> - if (dl_rq->this_bw > old)
> + if (SCHED_WARN_ON(dl_rq->this_bw > old)) /* underflow */
>   dl_rq->this_bw = 0;
>   SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
>  }



Re: [PATCH 1/5] sched/deadline: Fix double accounting of rq/running bw in push_dl_task()

2019-07-26 Thread luca abeni
Hi Dietmar,

On Fri, 26 Jul 2019 09:27:52 +0100
Dietmar Eggemann  wrote:

> push_dl_task() always calls deactivate_task() with flags=0 which sets
> p->on_rq=TASK_ON_RQ_MIGRATING.

Uhm... This is a recent change in the deactivate_task() behaviour,
right? Because I tested SCHED_DEADLINE a lot, but I've never seen this
issue :)


Anyway, looking at the current code the change looks OK. Thanks for
fixing this issue!


Luca

> push_dl_task()->deactivate_task()->dequeue_task()->dequeue_task_dl()
> calls sub_[running/rq]_bw() since p->on_rq=TASK_ON_RQ_MIGRATING.
> So sub_[running/rq]_bw() in push_dl_task() is double-accounting for
> that task.
> 
> The same is true for add_[rq/running]_bw() and activate_task() on the
> destination (later) CPU.
> push_dl_task()->activate_task()->enqueue_task()->enqueue_task_dl()
> calls add_[rq/running]_bw() again since p->on_rq is still set to
> TASK_ON_RQ_MIGRATING.
> So the add_[rq/running]_bw() in enqueue_task_dl() is double-accounting
> for that task.
> 
> Fix this by removing the rq/running bw accounting in push_dl_task().
> 
> Trace (CONFIG_SCHED_DEBUG=y) before the fix on a 6 CPUs system with 6
> DL (12000, 10, 10) tasks showing the issue:
> 
> [   48.147868] dl_rq->running_bw > old
> [   48.147886] WARNING: CPU: 1 PID: 0 at kernel/sched/deadline.c:98
> ...
> [   48.274832]  inactive_task_timer+0x468/0x4e8
> [   48.279057]  __hrtimer_run_queues+0x10c/0x3b8
> [   48.283364]  hrtimer_interrupt+0xd4/0x250
> [   48.287330]  tick_handle_oneshot_broadcast+0x198/0x1d0
> ...
> [   48.360057] dl_rq->running_bw > dl_rq->this_bw
> [   48.360065] WARNING: CPU: 1 PID: 0 at kernel/sched/deadline.c:86
> ...
> [   48.488294]  task_contending+0x1a0/0x208
> [   48.492172]  enqueue_task_dl+0x3b8/0x970
> [   48.496050]  activate_task+0x70/0xd0
> [   48.499584]  ttwu_do_activate+0x50/0x78
> [   48.503375]  try_to_wake_up+0x270/0x7a0
> [   48.507167]  wake_up_process+0x14/0x20
> [   48.510873]  hrtimer_wakeup+0x1c/0x30
> ...
> [   50.062867] dl_rq->this_bw > old
> [   50.062885] WARNING: CPU: 1 PID: 2048 at
> kernel/sched/deadline.c:122 ...
> [   50.190520]  dequeue_task_dl+0x1e4/0x1f8
> [   50.194400]  __sched_setscheduler+0x1d0/0x860
> [   50.198707]  _sched_setscheduler+0x74/0x98
> [   50.202757]  do_sched_setscheduler+0xa8/0x110
> [   50.207065]  __arm64_sys_sched_setscheduler+0x1c/0x30
> 
> Signed-off-by: Dietmar Eggemann 
> ---
>  kernel/sched/deadline.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index de2bd006fe93..d1aeada374e1 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2121,17 +2121,13 @@ static int push_dl_task(struct rq *rq)
>   }
>  
>   deactivate_task(rq, next_task, 0);
> - sub_running_bw(&next_task->dl, &rq->dl);
> - sub_rq_bw(&next_task->dl, &rq->dl);
>   set_task_cpu(next_task, later_rq->cpu);
> - add_rq_bw(&next_task->dl, &later_rq->dl);
>  
>   /*
>* Update the later_rq clock here, because the clock is used
>* by the cpufreq_update_util() inside __add_running_bw().
>*/
>   update_rq_clock(later_rq);
> - add_running_bw(&next_task->dl, &later_rq->dl);
>   activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
>   ret = 1;
>  



Re: [RFC PATCH 3/6] sched/dl: Try better placement even for deadline tasks that do not block

2019-07-09 Thread luca abeni
Hi Peter,

On Mon, 8 Jul 2019 15:55:36 +0200
Peter Zijlstra  wrote:

> On Mon, May 06, 2019 at 06:48:33AM +0200, Luca Abeni wrote:
> > @@ -1223,8 +1250,17 @@ static void update_curr_dl(struct rq *rq)
> > dl_se->dl_overrun = 1;
> >  
> > __dequeue_task_dl(rq, curr, 0);
> > -   if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
> > +   if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) {
> > enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
> > +#ifdef CONFIG_SMP
> > +   } else if (dl_se->dl_adjust) {
> > +   if (rq->migrating_task == NULL) {
> > +   queue_balance_callback(rq, &per_cpu(dl_migrate_head, rq->cpu), migrate_dl_task);  
> 
> I'm not entirely sure about this one.
> 
> That is, we only do those callbacks from:
> 
>   schedule_tail()
>   __schedule()
>   rt_mutex_setprio()
>   __sched_setscheduler()
> 
> and the above looks like it can happen outside of those.

Sorry, I did not know the constraints or requirements for using
queue_balance_callback()...

I used it because I wanted to trigger a migration from
update_curr_dl(), but invoking double_lock_balance() from this function
obviously resulted in a warning. So, I probably misunderstood the
purpose of the balance callback API, and I misused it.

What would have been the "right way" to trigger a migration for a task
when it is throttled?


> 
> The pattern in those sites is:
> 
>   rq_lock();
>   ... do crap that leads to queue_balance_callback()
>   rq_unlock()
>   if (rq->balance_callback) {
>   raw_spin_lock_irqsave(rq->lock, flags);
>   ... do callbacks
>   raw_spin_unlock_irqrestore(rq->lock, flags);
>   }
> 
> So I suppose can catch abuse of this API by doing something like the
> below; can you validate?

Sorry; right now I cannot run tests on big.LITTLE machines... 
Maybe Dietmar (added in cc), who is working on mainlining this patchset,
can test?



Thanks,
Luca

> 
> ---
> 
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index aaca0e743776..89e615f1eae6 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1134,6 +1134,14 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
>   rf->cookie = lockdep_pin_lock(&rq->lock);
>  
>  #ifdef CONFIG_SCHED_DEBUG
> +#ifdef CONFIG_SMP
> + /*
> +  * There should not be pending callbacks at the start of
> rq_lock();
> +  * all sites that handle them flush them at the end.
> +  */
> + WARN_ON_ONCE(rq->balance_callback);
> +#endif
> +
>   rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
>   rf->clock_update_flags = 0;
>  #endif



Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations

2019-07-08 Thread luca abeni
Hi Dietmar,

On Thu, 4 Jul 2019 14:05:22 +0200
Dietmar Eggemann  wrote:

> On 5/6/19 6:48 AM, Luca Abeni wrote:
> 
> [...]
> 
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index 5b981eeeb944..3436f3d8fa8f 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -1584,6 +1584,9 @@ select_task_rq_dl(struct task_struct *p, int
> > cpu, int sd_flag, int flags) if (sd_flag != SD_BALANCE_WAKE)
> > goto out;
> >  
> > +   if (dl_entity_is_special(&p->dl))
> > +   goto out;  
> 
> I wonder if this is really required. The if condition
> 
> 1591 if (unlikely(dl_task(curr)) &&
> 1592 (curr->nr_cpus_allowed < 2 ||
> 1593  !dl_entity_preempt(&p->dl, &curr->dl)) &&
> 1594 (p->nr_cpus_allowed > 1)) {
> 
> further below uses '!dl_entity_preempt(&p->dl, &curr->dl))' which
> returns 'dl_entity_is_special(a) || ...'

Uhm... I do not remember the details; I remember that the check fixed
something during the development of the patchset, but I did not check
if it was still needed when I forward-ported the patches...

So, maybe it worked around some bugs in previous versions of the
kernel, but is not needed now.



Luca


Re: [RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup

2019-05-08 Thread luca abeni
Hi Juri,

On Wed, 8 May 2019 15:10:18 +0200
Juri Lelli  wrote:

> On 08/05/19 14:47, luca abeni wrote:
> 
> [...]
> 
> > Notice that all this logic is used only to select one of the idle
> > cores (instead of picking the first idle core, we select the least
> > powerful core on which the task "fits").
> > 
> > So, running_bw does not provide any useful information, in this
> > case; maybe this_bw can be more useful.  
> 
> Ah, indeed.
> 
> However, what happens when cores are all busy? Can load balancing
> decisions based on deadline values only break capacity awareness?  (I
> understand this is an RFC, so a "yes, something we need to look at" is
> OK :-)

I have no definitive answer about this :)

If we do not want to break EDF and its properties, migrations have to
be performed so that the "global EDF invariant" (schedule the m tasks
with the earliest deadlines) is not broken (and capacity-based migrations
for non-idle cores risk breaking this invariant).

In theory, we should do something like "schedule the earliest deadline
task on the fastest core", but this would require too many migrations
(and would require migrating a task while it is executing, etc...)

I am convinced that doing the capacity-aware migrations when we have
idle cores helps to reduce migrations (avoiding "useless migrations")
when there are no idle cores, but I have no proof about this (just some
simple examples); this is why I say that I have no definitive answer...



Luca


Re: [RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup

2019-05-08 Thread luca abeni
Hi Juri,

On Wed, 8 May 2019 14:05:26 +0200
Juri Lelli  wrote:
[...]
> > > > +   if ((rel_deadline < 0) || (rel_deadline *
> > > > dl_se->dl_runtime < dl_se->dl_deadline * rem_runtime)) {
> > > > +   rel_deadline = dl_se->dl_deadline;
> > > > +   rem_runtime  = dl_se->dl_runtime;
> > > > +   }
> > > 
> > > So, are you basically checking if current remaining bw can be
> > > consumed safely?  
> > 
> > I check if the current runtime (rescaled based on the capacity) is
> > smaller than the time to the current scheduling deadline
> > (basically, if it can be consumed in time).
> > 
> > However, if
> > q / (d - t) > Q / P 
> > (where "q" is the current runtime, "d" is the scheduling deadline,
> > "Q" is the maximum runtime, and "P" is the CBS period), then a new
> > scheduling deadline will be generated (later), and the runtime will
> > be reset to Q... So, I need to use the maximum budget and CBS
> > period for checking if the task fits in the core.  
> 
> OK. I'd add a comment about it.

Ok; I'll add a comment in the next version of the patchset.


[...]
> > > I'm not actually sure if looking at dynamic values is what we
> > > need to do at this stage. By considering static values we fix
> > > admission control (and scheduling). Aren't dynamic values more to
> > > do with energy tradeoffs (and so to be introduced when starting
> > > to look at the energy model)?  
> > 
> > Using the current runtime and scheduling deadline might allow migrating
> > a task to SMALL cores (if its remaining runtime is small
> > enough), even if the rescaled Q is larger than P.
> > So, in theory it might help reduce the load on big cores.
> > 
> > If we decide that this is overkill, I can just drop the patch.  
> 
> So, my first impression was that we shouldn't be too clever until we
> start using info from the energy model (using which one should be able
> to understand if, for example, reducing load on big cores is a winning
> power/perf decision).

Ok.


> However, I was also wondering if we should actually compare dynamic
> parameters with {running,this}_bw (per-rq) the moment we search for
> potential migration candidates (so that we are not overloading rqs).

Notice that all this logic is used only to select one of the idle cores
(instead of picking the first idle core, we select the least powerful
core on which the task "fits").

So, running_bw does not provide any useful information, in this case;
maybe this_bw can be more useful.



Luca


Re: [RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup

2019-05-08 Thread luca abeni
On Wed, 8 May 2019 11:08:55 +0200
Juri Lelli  wrote:

> On 06/05/19 06:48, Luca Abeni wrote:
> > From: luca abeni 
> > 
> > Instead of considering the "static CPU bandwidth" allocated to
> > a SCHED_DEADLINE task (ratio between its maximum runtime and
> > reservation period), try to use the remaining runtime and time
> > to scheduling deadline.
> > 
> > Signed-off-by: luca abeni 
> > ---
> >  kernel/sched/cpudeadline.c | 9 +++--
> >  1 file changed, 7 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
> > index d21f7905b9c1..111dd9ac837b 100644
> > --- a/kernel/sched/cpudeadline.c
> > +++ b/kernel/sched/cpudeadline.c
> > @@ -114,8 +114,13 @@ static inline int dl_task_fit(const struct
> > sched_dl_entity *dl_se, int cpu, u64 *c)
> >  {
> > u64 cap = (arch_scale_cpu_capacity(NULL, cpu) *
> > arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT;
> > -   s64 rel_deadline = dl_se->dl_deadline;
> > -   u64 rem_runtime  = dl_se->dl_runtime;
> > +   s64 rel_deadline = dl_se->deadline -
> > sched_clock_cpu(smp_processor_id());
> > +   u64 rem_runtime  = dl_se->runtime;
> > +
> > +   if ((rel_deadline < 0) || (rel_deadline *
> > dl_se->dl_runtime < dl_se->dl_deadline * rem_runtime)) {
> > +   rel_deadline = dl_se->dl_deadline;
> > +   rem_runtime  = dl_se->dl_runtime;
> > +   }  
> 
> So, are you basically checking if current remaining bw can be consumed
> safely?

I check if the current runtime (rescaled based on the capacity) is
smaller than the time to the current scheduling deadline (basically, if
it can be consumed in time).

However, if
q / (d - t) > Q / P 
(where "q" is the current runtime, "d" is the scheduling deadline, "Q"
is the maximum runtime, and "P" is the CBS period), then a new
scheduling deadline will be generated (later), and the runtime will be
reset to Q... So, I need to use the maximum budget and CBS period for
checking if the task fits in the core.
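
A stand-alone sketch of that check (illustration only, with made-up numbers;
not the patch itself): the remaining (q, d - t) pair is used only while it is
no "denser" than the static (Q, P) pair, otherwise CBS would post a new
deadline and refill the runtime to Q anyway, so the static parameters have to
be used for the fit test:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t Q = 70, P = 100;	/* static runtime / period       */
	int64_t q = 30, d_minus_t = 35;	/* remaining runtime / time to d */

	/* q / (d - t) > Q / P, cross-multiplied to avoid divisions */
	if (d_minus_t < 0 || q * P > Q * d_minus_t) {
		q = Q;
		d_minus_t = P;
	}
	printf("values used for the fit test: runtime=%lld, horizon=%lld\n",
	       (long long)q, (long long)d_minus_t);
	return 0;
}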

> 
> I'm not actually sure if looking at dynamic values is what we need to
> do at this stage. By considering static values we fix admission
> control (and scheduling). Aren't dynamic values more to do with
> energy tradeoffs (and so to be introduced when starting to look at
> the energy model)?

Using the current runtime and scheduling deadline might allow migrating
a task to SMALL cores (if its remaining runtime is small
enough), even if the rescaled Q is larger than P.
So, in theory it might help reduce the load on big cores.

If we decide that this is overkill, I can just drop the patch.



Luca

> Another pair of hands maybe is to look at the dynamic spare bw of CPUs
> (to check that we don't overload CPUs).
> 
> Thanks,
> 
> - Juri



Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations

2019-05-08 Thread luca abeni
On Wed, 8 May 2019 10:04:36 +0200
Juri Lelli  wrote:

> Hi Luca,
> 
> On 06/05/19 06:48, Luca Abeni wrote:
> > From: luca abeni 
> > 
> > Currently, the SCHED_DEADLINE scheduler uses a global EDF scheduling
> > algorithm, migrating tasks to CPU cores without considering the core
> > capacity and the task utilization. This works well on homogeneous
> > systems (SCHED_DEADLINE tasks are guaranteed to have a bounded
> > tardiness), but presents some issues on heterogeneous systems. For
> > example, a SCHED_DEADLINE task might be migrated on a core that has
> > not enough processing capacity to correctly serve the task (think
> > about a task with runtime 70ms and period 100ms migrated to a core
> > with processing capacity 0.5)
> > 
> > This commit is a first step to address the issue: When a task wakes
> > up or migrates away from a CPU core, the scheduler tries to find an
> > idle core having enough processing capacity to serve the task.
> > 
> > Signed-off-by: luca abeni 
> > ---
> >  kernel/sched/cpudeadline.c | 31 +--
> >  kernel/sched/deadline.c|  8 ++--
> >  kernel/sched/sched.h   |  7 ++-
> >  3 files changed, 41 insertions(+), 5 deletions(-)
> > 
> > diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
> > index 50316455ea66..d21f7905b9c1 100644
> > --- a/kernel/sched/cpudeadline.c
> > +++ b/kernel/sched/cpudeadline.c
> > @@ -110,6 +110,22 @@ static inline int cpudl_maximum(struct cpudl *cp)
> > return cp->elements[0].cpu;
> >  }
> >  
> > +static inline int dl_task_fit(const struct sched_dl_entity *dl_se,
> > + int cpu, u64 *c)
> > +{
> > +   u64 cap = (arch_scale_cpu_capacity(NULL, cpu) *
> > arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT;
> > +   s64 rel_deadline = dl_se->dl_deadline;
> > +   u64 rem_runtime  = dl_se->dl_runtime;  
> 
> This is not the dynamic remaining one, is it?

Right; I preferred to split this into two patches so that if we decide to
use only the static task parameters (dl_deadline and dl_runtime) I can
simply drop a patch ;-)


Luca

> 
> I see however 4/6.. lemme better look at that.
> 
> Best,
> 
> - Juri



Re: [RFC PATCH 3/6] sched/dl: Try better placement even for deadline tasks that do not block

2019-05-08 Thread luca abeni
Hi Juri,

On Wed, 8 May 2019 10:01:16 +0200
Juri Lelli  wrote:

> Hi Luca,
> 
> On 06/05/19 06:48, Luca Abeni wrote:
> > From: luca abeni 
> > 
> > Currently, the scheduler tries to find a proper placement for
> > SCHED_DEADLINE tasks when they are pushed out of a core or when
> > they wake up. Hence, if there is a single SCHED_DEADLINE task
> > that never blocks and wakes up, such a task is never migrated to
> > an appropriate CPU core, but continues to execute on its original
> > core.
> > 
> > This commit addresses the issue by trying to migrate a
> > SCHED_DEADLINE task (searching for an appropriate CPU core) the
> > first time it is throttled.  
> 
> Why we failed to put the task on a CPU with enough (max) capacity
> right after it passed admission control? The very first time the task
> was scheduled I mean.

I think the currently executing task cannot be pushed out of a
CPU/core, right?

So, if a task switches from SCHED_OTHER to SCHED_DEADLINE while it is
executing on a fast core, the only way to migrate it would be to
preempt it (by using the stop_sched_class, I think), no?
(the typical situation here is a "cpu hog" task that switches from
SCHED_OTHER to SCHED_DEADLINE, and it is the only SCHED_DEADLINE
task... The task never blocks, so push/pull functions are never invoked)

Or am I missing something?



Thanks,
Luca


Re: [RFC PATCH 6/6] sched/dl: Try not to select a too fast core

2019-05-08 Thread luca abeni
Hi Quentin,

On Tue, 7 May 2019 16:57:34 +0100
Quentin Perret  wrote:

> On Monday 06 May 2019 at 06:48:36 (+0200), Luca Abeni wrote:
> > From: luca abeni 
> > 
> > When a task can fit on multiple CPU cores, try to select the slowest
> > core that is able to properly serve the task. This avoids useless
> > future migrations, leaving the "fast cores" idle for more
> > heavyweight tasks.  
> 
> But only if the _current_ capacity of big CPUs (at the current freq)
> is higher than the current capacity of the littles, is that right ?
> So we don't really have a guarantee to pack small tasks on little
> cores ...

Yes, the capacity is estimated at the current frequency, so this is a
potential problem.


> What is the rationale for looking at the current freq in
> dl_task_fit() ?

Mainly two reasons: the first one is to try to reduce frequency
switches (I did not perform measurements on the hikey960, I remember
that on other CPUs a frequency switch can take a considerable amount of
time).

Then, I wanted to have a mechanism that can work with all the possible
cpufreq governors... So, I did not assume that the frequency can change
(for example, I remember that without considering the current
frequency I had issues when using the "userspace" governor).


Maybe I just do not know this kernel subsystem well enough, but I did
not find any way to know the maximum frequency that the current
governor can set (I mean, I found a "maximum frequency" field that
tells me the maximum frequency that the cpufreq driver can set, but I
do not know if the governor will be able to set it --- again, consider
the "userspace" governor).

If there is a way to know this value, then I can use it for checking if
a task can fit in a core.



Thanks,
Luca



> Energy reasons ? If so, I'd argue you should look at
> the energy model to break the tie between CPU candidates ... ;)
> 
> And in the mean time, you could just look at arch_scale_cpu_capacity()
> to check if a task fits ?
> 
> > Signed-off-by: luca abeni 
> > ---
> >  kernel/sched/cpudeadline.c | 17 -
> >  1 file changed, 12 insertions(+), 5 deletions(-)
> > 
> > diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
> > index 2a4ac7b529b7..897ed71af515 100644
> > --- a/kernel/sched/cpudeadline.c
> > +++ b/kernel/sched/cpudeadline.c
> > @@ -143,17 +143,24 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
> > 		   struct cpumask *later_mask)
> >  {
> > 	const struct sched_dl_entity *dl_se = &p->dl;
> > +	struct cpumask tmp_mask;
> >  
> > 	if (later_mask &&
> > -	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
> > +	    cpumask_and(&tmp_mask, cp->free_cpus, &p->cpus_allowed)) {
> > 		int cpu, max_cpu = -1;
> > -		u64 max_cap = 0;
> > +		u64 max_cap = 0, min_cap = SCHED_CAPACITY_SCALE * SCHED_CAPACITY_SCALE;
> >  
> > -		for_each_cpu(cpu, later_mask) {
> > +		cpumask_clear(later_mask);
> > +		for_each_cpu(cpu, &tmp_mask) {
> > 			u64 cap;
> >  
> > -			if (!dl_task_fit(&p->dl, cpu, &cap))
> > -				cpumask_clear_cpu(cpu, later_mask);
> > +			if (dl_task_fit(&p->dl, cpu, &cap) && (cap <= min_cap)) {
> > +				if (cap < min_cap) {
> > +					min_cap = cap;
> > +					cpumask_clear(later_mask);
> > +				}
> > +				cpumask_set_cpu(cpu, later_mask);
> > +			}
> >  
> > if (cap > max_cap) {
> > max_cap = cap;
> > -- 
> > 2.20.1
> >   
> 
> Thanks,
> Quentin



Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations

2019-05-07 Thread luca abeni
Hi,

On Tue, 7 May 2019 15:10:50 +0100
Quentin Perret  wrote:

> On Monday 06 May 2019 at 06:48:32 (+0200), Luca Abeni wrote:
> >  static inline unsigned long cpu_bw_dl(struct rq *rq)
> >  {
> > -	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
> > +	unsigned long res;
> > +
> > +	res = (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
> > +
> > +	return (res << SCHED_CAPACITY_SHIFT) /
> > +	       arch_scale_cpu_capacity(NULL, rq->cpu);
> 
> The only user of cpu_bw_dl() is schedutil right ? If yes, we probably
> don't want to scale things here -- schedutil already does this I
> believe.

I think I added this modification because I arrived to the conclusion
that schedutils does not perform this rescaling (at least, I think it
did not perform it when I wrote the patch :)


BTW, while re-reading the patch I do not understand why this change was
added in this specific patch... I suspect it should have gone in a
separate patch.



Luca

> 
> Thanks,
> Quentin
> 
> >  }
> >  
> >  static inline unsigned long cpu_util_dl(struct rq *rq)
> > -- 
> > 2.20.1
> >   



Re: [RFC PATCH 1/6] sched/dl: Improve deadline admission control for asymmetric CPU capacities

2019-05-07 Thread luca abeni
On Tue, 7 May 2019 15:31:27 +0100
Quentin Perret  wrote:

> On Tuesday 07 May 2019 at 16:25:23 (+0200), luca abeni wrote:
> > On Tue, 7 May 2019 14:48:52 +0100
> > Quentin Perret  wrote:
> >   
> > > Hi Luca,
> > > 
> > > On Monday 06 May 2019 at 06:48:31 (+0200), Luca Abeni wrote:  
> > > > diff --git a/drivers/base/arch_topology.c
> > > > b/drivers/base/arch_topology.c index edfcf8d982e4..646d6d349d53
> > > > 100644 --- a/drivers/base/arch_topology.c
> > > > +++ b/drivers/base/arch_topology.c
> > > > @@ -36,6 +36,7 @@ DEFINE_PER_CPU(unsigned long, cpu_scale) =
> > > > SCHED_CAPACITY_SCALE; 
> > > >  void topology_set_cpu_scale(unsigned int cpu, unsigned long
> > > > capacity) {
> > > > +   topology_update_cpu_capacity(cpu, per_cpu(cpu_scale,
> > > > cpu), capacity);
> > > 
> > > Why is that one needed ? Don't you end up re-building the sched
> > > domains after this anyways ?  
> > 
> > If I remember correctly, this function was called at boot time when
> > the capacities are assigned to the CPU cores.
> > 
> > I do not remember if the sched domain was re-built after this call,
> > but I admit I do not know this part of the kernel very well...  
> 
> Right and things moved recently in this area, see bb1fbdd3c3fd
> ("sched/topology, drivers/base/arch_topology: Rebuild the sched_domain
> hierarchy when capacities change")

Ah, thanks! I missed this change when rebasing the patchset.
I guess this part of the patch has to be updated (and probably became
useless?), then.


Thanks,
Luca



> 
> > This achieved the effect of correctly setting up the "rd_capacity"
> > field, but I do not know if there is a better/simpler way to achieve
> > the same result :)  
> 
> OK, that's really an implementation detail, so no need to worry too
> much about it at the RFC stage I suppose :-)
> 
> Thanks,
> Quentin



Re: [RFC PATCH 1/6] sched/dl: Improve deadline admission control for asymmetric CPU capacities

2019-05-07 Thread luca abeni
On Tue, 7 May 2019 14:48:52 +0100
Quentin Perret  wrote:

> Hi Luca,
> 
> On Monday 06 May 2019 at 06:48:31 (+0200), Luca Abeni wrote:
> > diff --git a/drivers/base/arch_topology.c
> > b/drivers/base/arch_topology.c index edfcf8d982e4..646d6d349d53
> > 100644 --- a/drivers/base/arch_topology.c
> > +++ b/drivers/base/arch_topology.c
> > @@ -36,6 +36,7 @@ DEFINE_PER_CPU(unsigned long, cpu_scale) =
> > SCHED_CAPACITY_SCALE; 
> >  void topology_set_cpu_scale(unsigned int cpu, unsigned long
> > capacity) {
> > +   topology_update_cpu_capacity(cpu, per_cpu(cpu_scale, cpu),
> > capacity);  
> 
> Why is that one needed ? Don't you end up re-building the sched
> domains after this anyways ?

If I remember correctly, this function was called at boot time when the
capacities are assigned to the CPU cores.

I do not remember if the sched domain was re-built after this call, but
I admit I do not know this part of the kernel very well...

This achieved the effect of correctly setting up the "rd_capacity"
field, but I do not know if there is a better/simpler way to achieve
the same result :)



Thanks,
Luca



> 
> > per_cpu(cpu_scale, cpu) = capacity;
> >  }  
> 
> Thanks,
> Quentin



Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations

2019-05-07 Thread luca abeni
Hi Quentin,

On Tue, 7 May 2019 14:35:28 +0100
Quentin Perret  wrote:

> Hi Luca,
> 
> On Monday 06 May 2019 at 06:48:32 (+0200), Luca Abeni wrote:
> > +static inline int dl_task_fit(const struct sched_dl_entity *dl_se,
> > + int cpu, u64 *c)
> > +{
> > +   u64 cap = (arch_scale_cpu_capacity(NULL, cpu) *
> > arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT;  
> 
> I'm a little bit confused by this use of arch_scale_freq_capacity()
> here. IIUC this means you would say a big DL task doesn't fit on a big
> CPU just because it happens to be running at a low frequency when this
> function is called. Is this what we want ?

The idea of this approach was to avoid frequency switches when
possible; so, I wanted to check if the task fits on a CPU core at its
current operating frequency.


> If the frequency is low, we can (probably) raise it to accommodate
> this DL task so perhaps we should say it fits ?

In a later patch, if the task does not fit on any core (at its current
frequency), the task is moved to the core having the maximum capacity
(without considering the operating frequency --- at least, this was my
intention when I wrote the patches :)



Luca

> 
> > +   s64 rel_deadline = dl_se->dl_deadline;
> > +   u64 rem_runtime  = dl_se->dl_runtime;
> > +
> > +   if (c)
> > +   *c = cap;
> > +
> > +   if ((rel_deadline * cap) >> SCHED_CAPACITY_SHIFT <
> > rem_runtime)
> > +   return 0;
> > +
> > +   return 1;
> > +}  
> 
> Thanks,
> Quentin



[RFC PATCH 2/6] sched/dl: Capacity-aware migrations

2019-05-05 Thread Luca Abeni
From: luca abeni 

Currently, the SCHED_DEADLINE scheduler uses a global EDF scheduling
algorithm, migrating tasks to CPU cores without considering the core
capacity and the task utilization. This works well on homogeneous
systems (SCHED_DEADLINE tasks are guaranteed to have a bounded
tardiness), but presents some issues on heterogeneous systems. For
example, a SCHED_DEADLINE task might be migrated on a core that has not
enough processing capacity to correctly serve the task (think about a
task with runtime 70ms and period 100ms migrated to a core with
processing capacity 0.5)
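
(With those numbers the task needs 0.7 of a full-speed core: on a core
with capacity 0.5 the 70ms of work take roughly 140ms of wall-clock
time, so the 100ms deadline cannot be respected.)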

This commit is a first step to address the issue: When a task wakes
up or migrates away from a CPU core, the scheduler tries to find an
idle core having enough processing capacity to serve the task.

Signed-off-by: luca abeni 
---
 kernel/sched/cpudeadline.c | 31 +--
 kernel/sched/deadline.c|  8 ++--
 kernel/sched/sched.h   |  7 ++-
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 50316455ea66..d21f7905b9c1 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -110,6 +110,22 @@ static inline int cpudl_maximum(struct cpudl *cp)
return cp->elements[0].cpu;
 }
 
+static inline int dl_task_fit(const struct sched_dl_entity *dl_se,
+ int cpu, u64 *c)
+{
+	u64 cap = (arch_scale_cpu_capacity(NULL, cpu) * arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT;
+   s64 rel_deadline = dl_se->dl_deadline;
+   u64 rem_runtime  = dl_se->dl_runtime;
+
+   if (c)
+   *c = cap;
+
+   if ((rel_deadline * cap) >> SCHED_CAPACITY_SHIFT < rem_runtime)
+   return 0;
+
+   return 1;
+}
+
 /*
  * cpudl_find - find the best (later-dl) CPU in the system
  * @cp: the cpudl max-heap context
@@ -125,8 +141,19 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 
if (later_mask &&
	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
-   return 1;
-   } else {
+   int cpu;
+
+   for_each_cpu(cpu, later_mask) {
+   u64 cap;
+
			if (!dl_task_fit(&p->dl, cpu, &cap))
+   cpumask_clear_cpu(cpu, later_mask);
+   }
+
+   if (!cpumask_empty(later_mask))
+   return 1;
+   }
+   {
int best_cpu = cpudl_maximum(cp);
 
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5b981eeeb944..3436f3d8fa8f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1584,6 +1584,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int 
sd_flag, int flags)
if (sd_flag != SD_BALANCE_WAKE)
goto out;
 
+	if (dl_entity_is_special(&p->dl))
+   goto out;
+
rq = cpu_rq(cpu);
 
rcu_read_lock();
@@ -1598,10 +1601,11 @@ select_task_rq_dl(struct task_struct *p, int cpu, int 
sd_flag, int flags)
 * other hand, if it has a shorter deadline, we
 * try to make it stay here, it might be important.
 */
-   if (unlikely(dl_task(curr)) &&
+   if ((unlikely(dl_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
	     !dl_entity_preempt(&p->dl, &curr->dl)) &&
-   (p->nr_cpus_allowed > 1)) {
+   (p->nr_cpus_allowed > 1)) ||
+	    static_branch_unlikely(&sched_asym_cpucapacity)) {
int target = find_later_rq(p);
 
if (target != -1 &&
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 32d242694863..e5f9fd3aee80 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2367,7 +2367,12 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long 
util_cfs,
 
 static inline unsigned long cpu_bw_dl(struct rq *rq)
 {
-   return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+   unsigned long res;
+
+   res = (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+
+   return (res << SCHED_CAPACITY_SHIFT) /
+  arch_scale_cpu_capacity(NULL, rq->cpu);
 }
 
 static inline unsigned long cpu_util_dl(struct rq *rq)
-- 
2.20.1



[RFC PATCH 5/6] sched/dl: If the task does not fit anywhere, select the fastest core

2019-05-05 Thread Luca Abeni
From: luca abeni 

When a task has a remaining runtime that cannot be served within the
scheduling deadline by any of the idle cores, the task is doomed
to miss its deadline (this can happen, because the admission control
only guarantees a bounded tardiness, not the hard respect of all
deadlines). In this case, try to select the fastest idle core, to
minimize the tardiness.
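
For example, if a task with runtime 70ms and relative deadline 100ms
only finds idle cores with capacities 0.4 and 0.5, it does not fit on
any of them; selecting the 0.5 core at least minimizes how late the
task will finish.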

Signed-off-by: luca abeni 
---
 kernel/sched/cpudeadline.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 111dd9ac837b..2a4ac7b529b7 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -123,7 +123,7 @@ static inline int dl_task_fit(const struct sched_dl_entity 
*dl_se,
}
 
if (c)
-   *c = cap;
+   *c = arch_scale_cpu_capacity(NULL, cpu);
 
if ((rel_deadline * cap) >> SCHED_CAPACITY_SHIFT < rem_runtime)
return 0;
@@ -146,14 +146,22 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 
if (later_mask &&
	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
-   int cpu;
+   int cpu, max_cpu = -1;
+   u64 max_cap = 0;
 
for_each_cpu(cpu, later_mask) {
u64 cap;
 
			if (!dl_task_fit(&p->dl, cpu, &cap))
cpumask_clear_cpu(cpu, later_mask);
+
+   if (cap > max_cap) {
+   max_cap = cap;
+   max_cpu = cpu;
+   }
}
+   if (cpumask_empty(later_mask) && max_cap)
+   cpumask_set_cpu(max_cpu, later_mask);
 
if (!cpumask_empty(later_mask))
return 1;
-- 
2.20.1



[RFC PATCH 6/6] sched/dl: Try not to select a too fast core

2019-05-05 Thread Luca Abeni
From: luca abeni 

When a task can fit on multiple CPU cores, try to select the slowest
core that is able to properly serve the task. This avoids useless
future migrations, leaving the "fast cores" idle for more heavyweight
tasks.
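
For example, if a task with runtime 30ms and relative deadline 100ms
finds both a big core (capacity 1.0) and a LITTLE core (capacity 0.5)
idle, it fits on both of them (30ms and about 60ms of wall-clock time,
respectively); selecting the LITTLE core leaves the big core free for
tasks that only fit there.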

Signed-off-by: luca abeni 
---
 kernel/sched/cpudeadline.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 2a4ac7b529b7..897ed71af515 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -143,17 +143,24 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
   struct cpumask *later_mask)
 {
	const struct sched_dl_entity *dl_se = &p->dl;
+   struct cpumask tmp_mask;
 
if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+	    cpumask_and(&tmp_mask, cp->free_cpus, &p->cpus_allowed)) {
 		int cpu, max_cpu = -1;
-		u64 max_cap = 0;
+		u64 max_cap = 0, min_cap = SCHED_CAPACITY_SCALE * SCHED_CAPACITY_SCALE;
 
-		for_each_cpu(cpu, later_mask) {
+		cpumask_clear(later_mask);
+		for_each_cpu(cpu, &tmp_mask) {
 			u64 cap;
 
-			if (!dl_task_fit(&p->dl, cpu, &cap))
-				cpumask_clear_cpu(cpu, later_mask);
+			if (dl_task_fit(&p->dl, cpu, &cap) && (cap <= min_cap)) {
+				if (cap < min_cap) {
+					min_cap = cap;
+					cpumask_clear(later_mask);
+				}
+				cpumask_set_cpu(cpu, later_mask);
+			}
 
if (cap > max_cap) {
max_cap = cap;
-- 
2.20.1



[RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup

2019-05-05 Thread Luca Abeni
From: luca abeni 

Instead of considering the "static CPU bandwidth" allocated to
a SCHED_DEADLINE task (ratio between its maximum runtime and
reservation period), try to use the remaining runtime and time
to scheduling deadline.
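
For example, a task with runtime 70ms and relative deadline 100ms never
fits on a capacity-0.5 core according to its static bandwidth; but if at
wakeup only 20ms of runtime are left and 60ms remain until the scheduling
deadline, the current instance can still be completed in time on such a
core (the 20ms of work take about 40ms there).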

Signed-off-by: luca abeni 
---
 kernel/sched/cpudeadline.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index d21f7905b9c1..111dd9ac837b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -114,8 +114,13 @@ static inline int dl_task_fit(const struct sched_dl_entity 
*dl_se,
  int cpu, u64 *c)
 {
u64 cap = (arch_scale_cpu_capacity(NULL, cpu) * 
arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT;
-   s64 rel_deadline = dl_se->dl_deadline;
-   u64 rem_runtime  = dl_se->dl_runtime;
+   s64 rel_deadline = dl_se->deadline - 
sched_clock_cpu(smp_processor_id());
+   u64 rem_runtime  = dl_se->runtime;
+
+   if ((rel_deadline < 0) || (rel_deadline * dl_se->dl_runtime < 
dl_se->dl_deadline * rem_runtime)) {
+   rel_deadline = dl_se->dl_deadline;
+   rem_runtime  = dl_se->dl_runtime;
+   }
 
if (c)
*c = cap;
-- 
2.20.1



[RFC PATCH 1/6] sched/dl: Improve deadline admission control for asymmetric CPU capacities

2019-05-05 Thread Luca Abeni
From: luca abeni 

Currently, the SCHED_DEADLINE admission control ensures that the
sum of reserved CPU bandwidths is smaller than x * M, where the
reserved CPU bandwidth of a SCHED_DEADLINE task is defined as the
ratio between its runtime and its period, x is a user-definable
percentage (95% by default, can be controlled through
/proc/sys/kernel/sched_rt_{runtime,period}_us) and M is the number
of CPU cores.

This admission control works well (guaranteeing bounded tardiness
for SCHED_DEADLINE tasks and non-starvation of non SCHED_DEADLINE
tasks) for homogeneous systems (where all the CPU cores have the
same computing capacity), but ends up over-allocating the CPU time
in presence of less-powerful CPU cores.

It can be easily shown how on asymmetric CPU capacity architectures
(such as ARM big.LITTLE) SCHED_DEADLINE tasks can easily starve all the
other tasks, making the system unusable.

This commit fixes the issue by explicitly considering the cores'
capacities in the admission test (where "M" is replaced by the sum
of all the capacities of cores in the root domain).
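
Informally, the admission test changes from

	sum_i(runtime_i / period_i) <= x * M

to something like

	sum_i(runtime_i / period_i) <= x * sum_{cpu in rd}(capacity_cpu / SCHED_CAPACITY_SCALE)

(this is just the intuition; in the code the comparison is performed on
the scaled, fixed-point values used by __dl_overflow()).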

Signed-off-by: luca abeni 
---
 drivers/base/arch_topology.c   |  1 +
 include/linux/sched/topology.h |  3 +++
 kernel/sched/core.c|  2 ++
 kernel/sched/deadline.c| 19 +--
 kernel/sched/sched.h   |  7 +--
 kernel/sched/topology.c|  9 +
 6 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index edfcf8d982e4..646d6d349d53 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -36,6 +36,7 @@ DEFINE_PER_CPU(unsigned long, cpu_scale) = 
SCHED_CAPACITY_SCALE;
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
 {
+   topology_update_cpu_capacity(cpu, per_cpu(cpu_scale, cpu), capacity);
per_cpu(cpu_scale, cpu) = capacity;
 }
 
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2bf680b42f3c..a4898e42f368 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -233,4 +233,7 @@ static inline int task_node(const struct task_struct *p)
return cpu_to_node(task_cpu(p));
 }
 
+void topology_update_cpu_capacity(unsigned int cpu, unsigned long old,
+ unsigned long new);
+
 #endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8456801caa3..a37bbb246802 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6271,6 +6271,7 @@ void set_rq_online(struct rq *rq)
if (class->rq_online)
class->rq_online(rq);
}
+		rq->rd->rd_capacity += arch_scale_cpu_capacity(NULL, cpu_of(rq));
}
 }
 
@@ -6286,6 +6287,7 @@ void set_rq_offline(struct rq *rq)
 
cpumask_clear_cpu(rq->cpu, rq->rd->online);
rq->online = 0;
+		rq->rd->rd_capacity -= arch_scale_cpu_capacity(NULL, cpu_of(rq));
}
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..5b981eeeb944 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -63,6 +63,10 @@ static inline int dl_bw_cpus(int i)
 
return cpus;
 }
+static inline unsigned long rd_capacity(struct rq *rq)
+{
+   return rq->rd->rd_capacity;
+}
 #else
 static inline struct dl_bw *dl_bw_of(int i)
 {
@@ -73,6 +77,11 @@ static inline int dl_bw_cpus(int i)
 {
return 1;
 }
+
+static inline unsigned long rd_capacity(struct rq *rq)
+{
+   return SCHED_CAPACITY_SCALE;
+}
 #endif
 
 static inline
@@ -2535,13 +2544,13 @@ int sched_dl_overflow(struct task_struct *p, int policy,
	raw_spin_lock(&dl_b->lock);
cpus = dl_bw_cpus(task_cpu(p));
if (dl_policy(policy) && !task_has_dl_policy(p) &&
-   !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+   !__dl_overflow(dl_b, rd_capacity(task_rq(p)), 0, new_bw)) {
if (hrtimer_active(>dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
-  !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		   !__dl_overflow(dl_b, rd_capacity(task_rq(p)), p->dl.dl_bw, new_bw)) {
/*
 * XXX this is slightly incorrect: when the task
 * utilization decreases, we should delay the total
@@ -2689,7 +2698,7 @@ int dl_task_can_attach(struct task_struct *p, const 
struct cpumask *cs_cpus_allo
dl_b = dl_bw_of(dest_cpu);
	raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(dest_cpu);
-   overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+   overflow = __dl_overflow(dl_b, rd

[RFC PATCH 3/6] sched/dl: Try better placement even for deadline tasks that do not block

2019-05-05 Thread Luca Abeni
From: luca abeni 

Currently, the scheduler tries to find a proper placement for
SCHED_DEADLINE tasks when they are pushed out of a core or when
they wake up. Hence, if there is a single SCHED_DEADLINE task
that never blocks and wakes up, such a task is never migrated to
an appropriate CPU core, but continues to execute on its original
core.

This commit addresses the issue by trying to migrate a SCHED_DEADLINE
task (searching for an appropriate CPU core) the first time it is
throttled.

Signed-off-by: luca abeni 
---
 include/linux/sched.h   |  1 +
 kernel/sched/deadline.c | 53 -
 kernel/sched/sched.h|  2 ++
 3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 863f70843875..5e322c8a94e0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -560,6 +560,7 @@ struct sched_dl_entity {
unsigned intdl_yielded: 1;
unsigned intdl_non_contending : 1;
unsigned intdl_overrun: 1;
+   unsigned intdl_adjust : 1;
 
/*
 * Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3436f3d8fa8f..db471889196b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -515,6 +515,7 @@ static inline bool need_pull_dl_task(struct rq *rq, struct 
task_struct *prev)
return dl_task(prev);
 }
 
+static DEFINE_PER_CPU(struct callback_head, dl_migrate_head);
 static DEFINE_PER_CPU(struct callback_head, dl_push_head);
 static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 
@@ -1149,6 +1150,32 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct 
sched_dl_entity *dl_se)
return (delta * u_act) >> BW_SHIFT;
 }
 
+#ifdef CONFIG_SMP
+static int find_later_rq(struct task_struct *task);
+
+static void migrate_dl_task(struct rq *rq)
+{
+   struct task_struct *t = rq->migrating_task;
+	struct sched_dl_entity *dl_se = &t->dl;
+   int cpu = find_later_rq(t);
+
+   if ((cpu != -1) && (cpu != rq->cpu)) {
+   struct rq *later_rq;
+
+   later_rq = cpu_rq(cpu);
+
+   double_lock_balance(rq, later_rq);
+		sub_running_bw(&t->dl, &rq->dl);
+		sub_rq_bw(&t->dl, &rq->dl);
+		set_task_cpu(t, later_rq->cpu);
+		add_rq_bw(&t->dl, &later_rq->dl);
+		add_running_bw(&t->dl, &later_rq->dl);
+   double_unlock_balance(rq, later_rq);
+   }
+   rq->migrating_task = NULL;
+   dl_se->dl_adjust = 0;
+}
+#endif
 /*
  * Update the current task's runtime statistics (provided it is still
  * a -deadline task and has not been removed from the dl_rq).
@@ -1223,8 +1250,17 @@ static void update_curr_dl(struct rq *rq)
dl_se->dl_overrun = 1;
 
__dequeue_task_dl(rq, curr, 0);
-   if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+   if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) {
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+#ifdef CONFIG_SMP
+   } else if (dl_se->dl_adjust) {
+   if (rq->migrating_task == NULL) {
+			queue_balance_callback(rq, &per_cpu(dl_migrate_head, rq->cpu), migrate_dl_task);
+			rq->migrating_task = current;
+		} else
+			printk_deferred("Throttled task before migrating the previous one???\n");
+#endif
+   }
 
		if (!is_leftmost(curr, &rq->dl))
resched_curr(rq);
@@ -1573,13 +1609,12 @@ static void yield_task_dl(struct rq *rq)
 
 #ifdef CONFIG_SMP
 
-static int find_later_rq(struct task_struct *task);
-
 static int
 select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
struct task_struct *curr;
struct rq *rq;
+   bool het;
 
if (sd_flag != SD_BALANCE_WAKE)
goto out;
@@ -1591,6 +1626,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int 
sd_flag, int flags)
 
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+	het = static_branch_unlikely(&sched_asym_cpucapacity);
 
/*
 * If we are dealing with a -deadline task, we must
@@ -1604,15 +1640,17 @@ select_task_rq_dl(struct task_struct *p, int cpu, int 
sd_flag, int flags)
if ((unlikely(dl_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
	     !dl_entity_preempt(&p->dl, &curr->dl)) &&
-   (p->nr_cpus_allowed > 1)) ||
-	    static_branch_unlikely(&sched_asym_cpucapacity)) {
+   (p->nr_cpus_allowed > 1)) || het) {
int target = find_later_rq(p);
 
 

[RFC PATCH 0/6] Capacity awareness for SCHED_DEADLINE

2019-05-05 Thread Luca Abeni
From: luca abeni 

The SCHED_DEADLINE scheduling policy currently has some issues with
asymmetric CPU capacity architectures (such as ARM big.LITTLE).
In particular, the admission control mechanism does not work correctly
(because it considers all cores to have the same speed of the fastest
core), and the scheduler does not consider the cores' speed when
migrating tasks. 

This patchset fixes the issues by explicitly considering the cores'
capacities in the admission control mechanism and in tasks' migration.

In particular, the scheduler is improved so that "heavyweight tasks"
(processes or threads having a runtime that on LITTLE cores becomes
larger than the relative deadline) are scheduled on big cores. This
allows respecting deadlines that are missed by the current scheduler.

Moreover, the migration logic is modified so that "lightweight tasks"
(processes or threads having a runtime that on LITTLE cores is still
smaller than the relative deadline) are scheduled on LITTLE cores
when possible. This allows saving some energy without missing deadlines. 
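
As a concrete example, assuming LITTLE cores with capacity 0.5: a task
with runtime 30ms and relative deadline 100ms is "lightweight" (its 30ms
of work take about 60ms on a LITTLE core, still within the deadline),
while a task with runtime 70ms and the same deadline is "heavyweight"
(the same work would take about 140ms) and needs to run on a big core.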
 

luca abeni (6):
  sched/dl: Improve deadline admission control for asymmetric CPU
capacities
  sched/dl: Capacity-aware migrations
  sched/dl: Try better placement even for deadline tasks that do not
block
  sched/dl: Improve capacity-aware wakeup
  sched/dl: If the task does not fit anywhere, select the fastest core
  sched/dl: Try not to select a too fast core

 drivers/base/arch_topology.c   |  1 +
 include/linux/sched.h  |  1 +
 include/linux/sched/topology.h |  3 ++
 kernel/sched/core.c|  2 +
 kernel/sched/cpudeadline.c | 53 ++--
 kernel/sched/deadline.c| 76 --
 kernel/sched/sched.h   | 16 +--
 kernel/sched/topology.c|  9 
 8 files changed, 143 insertions(+), 18 deletions(-)

-- 
2.20.1



[tip:sched/urgent] sched/deadline: Correctly handle active 0-lag timers

2019-04-16 Thread tip-bot for luca abeni
Commit-ID:  1b02cd6a2d7f3e2a6a5262887d2cb2912083e42f
Gitweb: https://git.kernel.org/tip/1b02cd6a2d7f3e2a6a5262887d2cb2912083e42f
Author: luca abeni 
AuthorDate: Mon, 25 Mar 2019 14:15:30 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Apr 2019 16:54:58 +0200

sched/deadline: Correctly handle active 0-lag timers

syzbot reported the following warning:

   [ ] WARNING: CPU: 4 PID: 17089 at kernel/sched/deadline.c:255 
task_non_contending+0xae0/0x1950

line 255 of deadline.c is:

WARN_ON(hrtimer_active(&dl_se->inactive_timer));

in task_non_contending().

Unfortunately, in some cases (for example, a deadline task
continuously blocking and waking immediately) it can happen that
a task blocks (and task_non_contending() is called) while the
0-lag timer is still active.

In this case, the safest thing to do is to immediately decrease
the running bandwidth of the task, without trying to re-arm the 0-lag timer.

Signed-off-by: luca abeni 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Juri Lelli 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: chengjian (D) 
Link: 
https://lkml.kernel.org/r/20190325131530.34706-1-luca.ab...@santannapisa.it
Signed-off-by: Ingo Molnar 
---
 kernel/sched/deadline.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
 
-	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
 
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
 * If the "0-lag time" already passed, decrease the active
 * utilization now, instead of starting a timer
 */
-   if (zerolag_time < 0) {
+	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {


[PATCH] sched/deadline: correctly handle active 0-lag timers

2019-03-25 Thread luca abeni
syzbot reported the following warning:
[  948.126369] WARNING: CPU: 4 PID: 17089 at kernel/sched/deadline.c:255 
task_non_contending+0xae0/0x1950
[  948.130198] Kernel panic - not syncing: panic_on_warn set ...
[  948.130198]
[  948.134221] CPU: 4 PID: 17089 Comm: syz-executor.1 Not tainted 4.19.27 #2
[  948.139072] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1ubuntu1 04/01/2014
[  948.141603] Call Trace:
[  948.142277]  dump_stack+0xca/0x13e
[  948.164636]  panic+0x1f7/0x543
[  948.168704]  ? refcount_error_report+0x29d/0x29d
[  948.172438]  ? __warn+0x1d1/0x210
[  948.183359]  ? task_non_contending+0xae0/0x1950
[  948.191747]  __warn+0x1ec/0x210
[  948.196276]  ? task_non_contending+0xae0/0x1950
[  948.202476]  report_bug+0x1ee/0x2b0
[  948.204622]  fixup_bug.part.7+0x37/0x80
[  948.206879]  do_error_trap+0x22c/0x290
[  948.211340]  ? math_error+0x2f0/0x2f0
[  948.217033]  ? trace_hardirqs_off_caller+0x40/0x190
[  948.222477]  ? trace_hardirqs_off_thunk+0x1a/0x1c
[  948.229877]  invalid_op+0x14/0x20
[  948.238317] RIP: 0010:task_non_contending+0xae0/0x1950
[  948.253825] Code: 6d 29 83 48 89 4c 24 20 48 89 54 24 10 c6 05 d0 89
5a 03 01 e8 11 ea ee ff 0f 0b 48 8b 4c 24 20 48 8b 54 24 10 e9 bb f7 ff
ff <0f> 0b e9 1d f6 ff ff e8 d4 a7 09 00 85 c0 0f 85 74 f8 ff ff 48 c7
[  948.272329] RSP: 0018:8883d443f8c0 EFLAGS: 00010002
[  948.293045] RAX: 0001 RBX: 8883d3572468 RCX: 813a6571
[  948.300323] RDX: 08ab RSI: c900030e4000 RDI: 8883e2fe6278
[  948.305278] RBP: 8883e2f0 R08: ed1078ea3ab2 R09: ed1078ea3ab2
[  948.316441] R10: 0001 R11: ed1078ea3ab1 R12: 0002c680
[  948.320257] R13: 8883d357217c R14: 0001 R15: 8883d3572140
[  948.324500]  ? hrtimer_active+0x171/0x1f0
[  948.327421]  ? dequeue_task_dl+0x38/0x970
[  948.330572]  __schedule+0x94b/0x1a80
[  948.333578]  ? __sched_text_start+0x8/0x8
[  948.336141]  ? lock_downgrade+0x5e0/0x5e0
[  948.338111]  ? plist_add+0x23e/0x480
[  948.339706]  schedule+0x7c/0x1a0
[  948.341395]  futex_wait_queue_me+0x319/0x600
[  948.343329]  ? get_futex_key_refs+0xd0/0xd0
[  948.345037]  ? lock_downgrade+0x5e0/0x5e0
[  948.347206]  ? get_futex_key_refs+0xa4/0xd0
[  948.353007]  futex_wait+0x1e7/0x590
[  948.355328]  ? futex_wait_setup+0x2b0/0x2b0
[  948.360578]  ? __lock_acquire+0x60c/0x3b70
[  948.369186]  ? __save_stack_trace+0x92/0x100
[  948.374344]  ? hash_futex+0x15/0x210
[  948.376832]  ? drop_futex_key_refs+0x3c/0xd0
[  948.378591]  ? futex_wake+0x14e/0x450
[  948.381609]  do_futex+0x5c9/0x15e0
[  948.384567]  ? perf_syscall_enter+0xb1/0xc80
[  948.390307]  ? exit_robust_list+0x240/0x240
[  948.393566]  ? ftrace_syscall_exit+0x5c0/0x5c0
[  948.396369]  ? lock_downgrade+0x5e0/0x5e0
[  948.401748]  ? __might_fault+0x17c/0x1c0
[  948.404171]  __x64_sys_futex+0x296/0x380
[  948.406472]  ? __ia32_sys_futex+0x370/0x370
[  948.440630]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[  948.441774]  ? trace_hardirqs_off_caller+0x40/0x190
[  948.442770]  ? do_syscall_64+0x3b/0x580
[  948.486728]  do_syscall_64+0xc8/0x580
[  948.489138]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  948.492072] RIP: 0033:0x462eb9
[  948.492788] Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00
48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
[  948.532016] RSP: 002b:7f7ac8a67cd8 EFLAGS: 0246 ORIG_RAX: 
00ca
[  948.536811] RAX: ffda RBX: 0073bf08 RCX: 00462eb9
[  948.542138] RDX:  RSI: 0080 RDI: 0073bf08
[  948.548077] RBP: 0073bf00 R08:  R09: 
[  948.562535] R10:  R11: 0246 R12: 0073bf0c
[  948.569184] R13:  R14: 0073bf00 R15: 7fff106d8c10

line 255 of deadline.c is
WARN_ON(hrtimer_active(&dl_se->inactive_timer));
in task_non_contending().
Unfortunately, in some cases (for example, a deadline task
continuously blocking and waking immediately) it can happen that
a task blocks (and task_non_contending() is called) while the
0-lag timer is still active. In this case, the safest thing to
do is to immediately decrease the running bandwidth of the task,
without trying to re-arm the 0-lag timer.

Signed-off-by: luca abeni 
---
 kernel/sched/deadline.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
 
-	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
 
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static 

Re: WARN ON at kernel/sched/deadline.c task_non_contending

2019-03-22 Thread luca abeni
Hi Juri,


On Fri, 22 Mar 2019 15:32:32 +0100 Juri Lelli  wrote:
[...]
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index 6a73e41a2016..43901fa3f269 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -252,7 +252,6 @@ static void task_non_contending(struct
> > task_struct *p) if (dl_entity_is_special(dl_se))
> > return;
> >  
> > -	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
> > WARN_ON(dl_se->dl_non_contending);
> >  
> > zerolag_time = dl_se->deadline -
> > @@ -269,7 +268,7 @@ static void task_non_contending(struct
> > task_struct *p)
> >  * If the "0-lag time" already passed, decrease the active
> >  * utilization now, instead of starting a timer
> >  */
> > -   if (zerolag_time < 0) {
> > +	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
> > 		if (dl_task(p))
> > sub_running_bw(dl_se, dl_rq);
> > if (!dl_task(p) || p->state == TASK_DEAD) {
> > 
> > 
> > The idea is that if the timer is active, we leave dl_non_contending
> > set to 0 (so that the timer handler does nothing), and we
> > immediately decrease the running bw.
> > I think this is OK, because this situation can happen only if the
> > task blocks, wakes up while the timer handler is running, and then
> > immediately blocks again - while the timer handler is still
> > running. So, the "zero lag time" cannot be too much in the future.  
> 
> And if we get here and the handler is running it means that the
> handler is spinning on rq->lock waiting the dequeue to release it.
> So, this looks safe to me as well.
> 
> BTW, I could reproduce with Steve's deadline_test [1], and this seems
> to fix it.
> 
> Would you mind sending out a proper patch Luca?

Thanks for looking at this. I'll try to prepare and send a patch in
next week.



Thanks,
Luca


Re: WARN ON at kernel/sched/deadline.c task_non_contending

2019-03-15 Thread luca abeni
Hi,

On Fri, 15 Mar 2019 08:43:00 +0800
"chengjian (D)"  wrote:
[...]
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index 6a73e41a2016..43901fa3f269 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -252,7 +252,6 @@ static void task_non_contending(struct
> > task_struct *p) if (dl_entity_is_special(dl_se))
> > return;
> >   
> > -	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
> > WARN_ON(dl_se->dl_non_contending);
> >   
> > zerolag_time = dl_se->deadline -
> > @@ -269,7 +268,7 @@ static void task_non_contending(struct
> > task_struct *p)
> >  * If the "0-lag time" already passed, decrease the active
> >  * utilization now, instead of starting a timer
> >  */
> > -   if (zerolag_time < 0) {
> > +	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
> > 		if (dl_task(p))
> > sub_running_bw(dl_se, dl_rq);
> > if (!dl_task(p) || p->state == TASK_DEAD) {
> >
> >
> > The idea is that if the timer is active, we leave dl_non_contending
> > set to 0 (so that the timer handler does nothing), and we
> > immediately decrease the running bw.
> > I think this is OK, because this situation can happen only if the
> > task blocks, wakes up while the timer handler is running, and then
> > immediately blocks again - while the timer handler is still
> > running. So, the "zero lag time" cannot be too much in the future.
> >
> >
> > Thanks,
> > Luca
> >
> > .  
> 
> 
> Yeah, it looks good.
> 
> I can do some experiments with it ,
> 
> Do you have some testcases to help me with the test ?

I just tried the test you provided... I also have some other
SCHED_DEADLINE tests at https://github.com/lucabe72/ReclaimingTests but
I did not try them with this patch yet.

Claudio Scordino also had some SCHED_DEADLINE tests here:
https://github.com/evidence/test-sched-dl



Luca


Re: WARN ON at kernel/sched/deadline.c task_non_contending

2019-03-13 Thread luca abeni
Hi,

(I added Juri in cc)

On Tue, 12 Mar 2019 10:03:12 +0800
"chengjian (D)"  wrote:
[...]
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 31c050a0d0ce..d73cb033a06d 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -252,7 +252,6 @@ static void task_non_contending(struct
> task_struct *p) if (dl_entity_is_special(dl_se))
>      return;
> 
> -	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
>      WARN_ON(dl_se->dl_non_contending);
> 
>      zerolag_time = dl_se->deadline -
> @@ -287,7 +286,9 @@ static void task_non_contending(struct
> task_struct *p) }
> 
>      dl_se->dl_non_contending = 1;
> -   get_task_struct(p);
> +
> +	if (!hrtimer_active(&dl_se->inactive_timer));
> +   get_task_struct(p);
>      hrtimer_start(timer, ns_to_ktime(zerolag_time),
> HRTIMER_MODE_REL); }

After looking at the patch a little bit more and running some tests,
I suspect this solution might be racy:
when the timer is already active, (and hrtimer_start() fails), it
relies on its handler to decrease the running bw (by setting
dl_non_contending to 1)... But inactive_task_timer() might have
already checked dl_non_contending, finding it equal to 0 (so, it
ends up doing nothing and the running bw is not decreased).


So, I would prefer a different solution. I think this patch should work:

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
 
-	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
 
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
 * If the "0-lag time" already passed, decrease the active
 * utilization now, instead of starting a timer
 */
-   if (zerolag_time < 0) {
+	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {


The idea is that if the timer is active, we leave dl_non_contending set to
0 (so that the timer handler does nothing), and we immediately decrease the
running bw.
I think this is OK, because this situation can happen only if the task
blocks, wakes up while the timer handler is running, and then immediately
blocks again - while the timer handler is still running. So, the "zero lag
time" cannot be too much in the future.
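
Roughly, the problematic sequence I have in mind is something like:

	task blocks        -> task_non_contending() arms the inactive timer
	timer fires        -> inactive_task_timer() starts running
	task wakes up      -> dl_non_contending is reset to 0, but the timer
	                      cannot be cancelled because its handler is running
	task blocks again  -> task_non_contending() finds the timer still active

so when task_non_contending() is called for the second time the handler
is about to complete, and the "zero lag time" is very close.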


Thanks,
Luca


Re: WARN ON at kernel/sched/deadline.c task_non_contending

2019-03-12 Thread luca abeni
Hi all,

On Tue, 12 Mar 2019 10:03:12 +0800
"chengjian (D)"  wrote:

> Hi.
> 
> When looking to test SCHED_DEADLINE syzkaller report an warn in
> task_non_contending(). I tested the mainline kernel with the C program
> and captured the same call trace.
[...]
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 31c050a0d0ce..d73cb033a06d 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -252,7 +252,6 @@ static void task_non_contending(struct
> task_struct *p) if (dl_entity_is_special(dl_se))
>      return;
> 
> -	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
>      WARN_ON(dl_se->dl_non_contending);
> 
>      zerolag_time = dl_se->deadline -
> @@ -287,7 +286,9 @@ static void task_non_contending(struct
> task_struct *p) }
> 
>      dl_se->dl_non_contending = 1;
> -   get_task_struct(p);
> +
> +	if (!hrtimer_active(&dl_se->inactive_timer));
> +   get_task_struct(p);
>      hrtimer_start(timer, ns_to_ktime(zerolag_time),
> HRTIMER_MODE_REL); }

At a first glance, I think the patch is OK, but I need some more time to
look at the details.

I'll run some experiments with the reproducer, and I'll let you know my
conclusions.


> Did I miss something ?
> 
> I saw it directly remove the hrtimer in hrtime_start() if hrtime is
> queued, it may be unsafe here when the timer handler is running.

This is probably why I added that WARN_ON()... I'll look at a possible
solution.



Thanks,
Luca


> 
> Help ?
> 
> I put the syzkaller log and C demo in attachments.
> 
> Thanks.
> 
> 
> 



Re: WARNING in enqueue_task_dl

2019-01-02 Thread luca abeni
Hi all,
(and, happy new year to everyone!)

this looks similar to a bug we have seen some time ago (a task
switching from SCHED_OTHER to SCHED_DEADLINE while inheriting a
deadline from a SCHED_DEADLINE task triggers the warning)...

Juri, I think you found a fix for such a bug; has it been committed?



Thanks,
Luca

On Mon, 31 Dec 2018 07:02:04 -0800
syzbot  wrote:

> syzbot has found a reproducer for the following crash on:
> 
> HEAD commit:195303136f19 Merge tag 'kconfig-v4.21-2' of
> git://git.kern.. git tree:   upstream
> console output:
> https://syzkaller.appspot.com/x/log.txt?x=118af84b40 kernel
> config:  https://syzkaller.appspot.com/x/.config?x=76d28549be7c27cf
> dashboard link:
> https://syzkaller.appspot.com/bug?extid=119ba87189432ead09b4
> compiler:   gcc (GCC) 9.0.0 20181231 (experimental) syz
> repro:
> https://syzkaller.appspot.com/x/repro.syz?x=10eb7ebf40 C
> reproducer:   https://syzkaller.appspot.com/x/repro.c?x=14156d7740
> 
> IMPORTANT: if you fix the bug, please add the following tag to the
> commit: Reported-by:
> syzbot+119ba87189432ead0...@syzkaller.appspotmail.com
> 
> WARNING: CPU: 0 PID: 9019 at kernel/sched/deadline.c:628  
> setup_new_dl_entity kernel/sched/deadline.c:629 [inline]
> WARNING: CPU: 0 PID: 9019 at kernel/sched/deadline.c:628
> enqueue_dl_entity kernel/sched/deadline.c:1429 [inline]
> WARNING: CPU: 0 PID: 9019 at kernel/sched/deadline.c:628  
> enqueue_task_dl+0x2355/0x3dc0 kernel/sched/deadline.c:1500
> Kernel panic - not syncing: panic_on_warn set ...
> CPU: 0 PID: 9019 Comm: syz-executor280 Not tainted 4.20.0+ #1
> Hardware name: Google Google Compute Engine/Google Compute Engine,
> BIOS Google 01/01/2011
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x1db/0x2d0 lib/dump_stack.c:113
>   panic+0x2cb/0x589 kernel/panic.c:189
>   __warn.cold+0x20/0x4b kernel/panic.c:544
>   report_bug+0x263/0x2b0 lib/bug.c:186
>   fixup_bug arch/x86/kernel/traps.c:178 [inline]
>   fixup_bug arch/x86/kernel/traps.c:173 [inline]
>   do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271
>   do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:290
>   invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973
> RIP: 0010:setup_new_dl_entity kernel/sched/deadline.c:628 [inline]
> RIP: 0010:enqueue_dl_entity kernel/sched/deadline.c:1429 [inline]
> RIP: 0010:enqueue_task_dl+0x2355/0x3dc0 kernel/sched/deadline.c:1500
> Code: 3c 02 00 0f 85 ba 05 00 00 49 8b b5 50 0a 00 00 e9 53 fa ff ff
> e8 fb f2 64 00 48 8d 4d d8 e9 48 dd ff ff 0f 0b e9 92 f1 ff ff <0f>
> 0b e9 18 f1 ff ff 4c 89 ef 4c 89 95 28 ff ff ff 4c 89 85 30 ff
> RSP: 0018:88809eebfaf8 EFLAGS: 00010002
> RAX: 0002 RBX: 111013dd7f6a RCX: dc00
> RDX: 00333cf09f75 RSI: 0004 RDI: 8880ae62d850
> RBP: 88809eebfbf8 R08: 88807fb0a538 R09: 88807fb0a2fc
> R10: 88807fb0a580 R11: 8880ae62dc7b R12: 88807fb0a2c0
> R13: 8880ae62ce00 R14: 8880ae62ce00 R15: 88807fb0a58c
>   enqueue_task+0xb9/0x380 kernel/sched/core.c:730
>   __sched_setscheduler+0xe32/0x1fe0 kernel/sched/core.c:4336
>   sched_setattr kernel/sched/core.c:4394 [inline]
>   __do_sys_sched_setattr kernel/sched/core.c:4570 [inline]
>   __se_sys_sched_setattr kernel/sched/core.c:4549 [inline]
>   __x64_sys_sched_setattr+0x1af/0x2f0 kernel/sched/core.c:4549
>   do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x44c829
> Code: e8 8c d8 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48
> 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48>
> 3d 01 f0 ff ff 0f 83 eb c9 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:7f28685e8ce8 EFLAGS: 0246 ORIG_RAX: 013a
> RAX: ffda RBX: 006e49f8 RCX: 0044c829
> RDX:  RSI: 2000 RDI: 
> RBP: 006e49f0 R08:  R09: 
> R10:  R11: 0246 R12: 006e49fc
> R13: 7ffd1981c8af R14: 7f28685e99c0 R15: 0001
> 
> ==
> WARNING: possible circular locking dependency detected
> 4.20.0+ #1 Not tainted
> --
> syz-executor280/9019 is trying to acquire lock:
> 1aef527c ((console_sem).lock){-.-.}, at:
> down_trylock+0x13/0x70 kernel/locking/semaphore.c:136
> 
> but task is already holding lock:
> 0ba17b09 (>lock){-.-.}, at: task_rq_lock+0xc8/0x290  
> kernel/sched/core.c:99
> 
> which lock already depends on the new lock.
> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #2 (>lock){-.-.}:  
> __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
> _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
> rq_lock kernel/sched/sched.h:1149 [inline]
> 

Re: WARNING in enqueue_task_dl

2018-11-19 Thread luca abeni
Hi all,

On Mon, 19 Nov 2018 09:23:03 +0100 (CET)
Thomas Gleixner  wrote:

> Adding scheduler folks
> 
> On Sun, 18 Nov 2018, syzbot wrote:
> 
> > Hello,
> > 
> > syzbot found the following crash on:
> > 
> > HEAD commit:1ce80e0fe98e Merge tag 'fsnotify_for_v4.20-rc3' of
> > git://g.. git tree:   upstream
> > console output:
> > https://syzkaller.appspot.com/x/log.txt?x=14ddbb0b40 kernel
> > config:  https://syzkaller.appspot.com/x/.config?x=d86f24333880b605
> > dashboard link:
> > https://syzkaller.appspot.com/bug?extid=119ba87189432ead09b4
> > compiler:   gcc (GCC) 8.0.1 20180413 (experimental) syz
> > repro:
> > https://syzkaller.appspot.com/x/repro.syz?x=13e9e01540
> > 
> > IMPORTANT: if you fix the bug, please add the following tag to the
> > commit: Reported-by:
> > syzbot+119ba87189432ead0...@syzkaller.appspotmail.com
> > 
> > IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
> > IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
> > 8021q: adding VLAN 0 to HW filter on device team0
> > hrtimer: interrupt took 33411 ns
> > sched: DL replenish lagged too much
> > WARNING: CPU: 1 PID: 6351 at kernel/sched/deadline.c:628
> > enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504

Here, it looks like a task is invoking sched_setattr() to become
SCHED_DEADLINE when dl_boosted is set...

Is this possible / correct?

If this (sched_setattr() with dl_boosted set) should not be possible,
then we have a bug that we need to investigate...

Otherwise, I suspect we can just remove the WARN_ON at line 628 of
deadline.c


Luca




> > PM: Basic memory bitmaps freed
> > Kernel panic - not syncing: panic_on_warn set ...
> > CPU: 1 PID: 6351 Comm: syz-executor0 Not tainted 4.20.0-rc2+ #338
> > Hardware name: Google Google Compute Engine/Google Compute Engine,
> > BIOS Google 01/01/2011
> > Call Trace:
> > __dump_stack lib/dump_stack.c:77 [inline]
> > dump_stack+0x244/0x39d lib/dump_stack.c:113
> > panic+0x2ad/0x55c kernel/panic.c:188
> > __warn.cold.8+0x20/0x45 kernel/panic.c:540
> > report_bug+0x254/0x2d0 lib/bug.c:186
> > fixup_bug arch/x86/kernel/traps.c:178 [inline]
> > do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271
> > do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290
> > invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:969
> > RIP: 0010:enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504
> > Code: ff 48 8b 8d c8 fe ff ff 48 c1 e6 2a 4c 8b 9d d0 fe ff ff 8b
> > 95 d8 fe ff ff 48 8b 85 e0 fe ff ff e9 16 e4 ff ff e8 16 d0 ea ff
> > <0f> 0b e9 17 f1 ff ff 48 8b bd e8 fe ff ff 4c 89 95 c8 fe ff ff 48
> > RSP: 0018:8881ba39fa18 EFLAGS: 00010002
> > RAX:  RBX: 8881b9d6c000 RCX: 8881b9d6c278
> > RDX: 8881b9d6c03c RSI: 0002 RDI: 8881daf2d710
> > RBP: 8881ba39fb78 R08: 0001 R09: 8881daf0
> > R10: 001a4d4f1987 R11: 8881daf2db3b R12: 111037473f4e
> > R13: 8881b9d6c2cc R14: 8881daf2ccc0 R15: 8881daf2ccc0
> > enqueue_task+0x184/0x390 kernel/sched/core.c:730
> > __sched_setscheduler+0xe99/0x2190 kernel/sched/core.c:4336
> > sched_setattr kernel/sched/core.c:4394 [inline]
> > __do_sys_sched_setattr kernel/sched/core.c:4570 [inline]
> > __se_sys_sched_setattr kernel/sched/core.c:4549 [inline]
> > __x64_sys_sched_setattr+0x1b2/0x2f0 kernel/sched/core.c:4549
> > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
> > entry_SYSCALL_64_after_hwframe+0x49/0xbe
> > RIP: 0033:0x457569
> > Code: fd b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8
> > 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05
> > <48> 3d 01 f0 ff ff 0f 83 cb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> > RSP: 002b:7f05ce0a2c78 EFLAGS: 0246 ORIG_RAX:
> > 013a RAX: ffda RBX: 0003 RCX:
> > 00457569 RDX:  RSI: 2000 RDI:
> >  RBP: 0072bfa0 R08:  R09:
> >  R10:  R11: 0246 R12:
> > 7f05ce0a36d4 R13: 004c369f R14: 004d5730 R15:
> > 
> > 
> > ==
> > WARNING: possible circular locking dependency detected
> > 4.20.0-rc2+ #338 Not tainted
> > --
> > syz-executor0/6351 is trying to acquire lock:
> > b2b97155 ((console_sem).lock){-.-.}, at:
> > down_trylock+0x13/0x70 kernel/locking/semaphore.c:136
> > 
> > but task is already holding lock:
> > 4cd5557e (>lock){-.-.}, at: task_rq_lock+0xc5/0x2a0
> > kernel/sched/core.c:99
> > 
> > which lock already depends on the new lock.
> > 
> > 
> > the existing dependency chain (in reverse order) is:
> >   
> > -> #2 (>lock){-.-.}:  
> >   __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
> >   _raw_spin_lock+0x2d/0x40 kernel/locking/spinlock.c:144
> >   rq_lock kernel/sched/sched.h:1126 [inline]
> >   

Re: INFO: rcu detected stall in do_idle

2018-10-30 Thread luca abeni
Hi Peter,

On Tue, 30 Oct 2018 11:45:54 +0100
Peter Zijlstra  wrote:
[...]
> >  2. This is related to perf_event_open syscall reproducer does
> > before becoming DEADLINE and entering the busy loop. Enabling of
> > perf swevents generates lot of hrtimers load that happens in the
> > reproducer task context. Now, DEADLINE uses rq_clock() for
> > setting deadlines, but rq_clock_task() for doing runtime
> > enforcement. In a situation like this it seems that the amount of
> > irq pressure becomes pretty big (I'm seeing this on kvm, real hw
> > should maybe do better, pain point remains I guess), so rq_clock()
> > and rq_clock_task() might become more a more skewed w.r.t. each
> > other. Since rq_clock() is only used when setting absolute
> > deadlines for the first time (or when resetting them in certain
> > cases), after a bit the replenishment code will start to see
> > postponed deadlines always in the past w.r.t. rq_clock(). And this
> > brings us back to the fact that the task is never stopped, since it
> > can't keep up with rq_clock().
> > 
> > - Not sure yet how we want to address this [1]. We could use
> >   rq_clock() everywhere, but tasks might be penalized by irq
> >   pressure (theoretically this would mandate that irqs are
> >   explicitly accounted for I guess). I tried to use the skew
> > between the two clocks to "fix" deadlines, but that puts us at
> > risks of de-synchronizing userspace and kernel views of deadlines.  
> 
> Hurm.. right. We knew of this issue back when we did it.
> I suppose now it hurts and we need to figure something out.
> 
> By virtue of being a real-time class, we do indeed need to have
> deadline on the wall-clock. But if we then don't account runtime on
> that same clock, but on a potentially slower clock, we get the
> problem that we can run longer than our period/deadline, which is
> what we're running into here I suppose.

I might be hugely misunderstanding something here, but in my impression
the issue is just that if the IRQ time is not accounted to the
-deadline task, then the non-deadline tasks might be starved.

I do not see this as a skew between two clocks, but as an accounting
thing:
- if we decide that the IRQ time is accounted to the -deadline
  task (this is what happens with CONFIG_IRQ_TIME_ACCOUNTING disabled),
  then the non-deadline tasks are not starved (but of course the
  -deadline task executes for less than its reserved time in the
  period); 
- if we decide that the IRQ time is not accounted to the -deadline task
  (this is what happens with CONFIG_IRQ_TIME_ACCOUNTING enabled), then
  the -deadline task executes for the expected amount of time (about
  60% of the CPU time), but an IRQ load of 40% will starve non-deadline
  tasks (this is what happens in the bug that triggered this discussion)

I think this might be seen as an admission control issue: when
CONFIG_IRQ_TIME_ACCOUNTING is disabled, the IRQ time is accounted for
in the admission control (because it ends up in the task's runtime),
but when CONFIG_IRQ_TIME_ACCOUNTING is enabled the IRQ time is not
accounted for in the admission test (the IRQ handler becomes some sort
of entity with a higher priority than -deadline tasks, on which no
accounting or enforcement is performed).
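
To make the accounting view concrete, here is a minimal sketch of an
admission test that also budgets an estimated IRQ utilization. It is
purely illustrative (the helper name, the irq_bw estimate and the
interface are invented, not existing kernel code); all values are
utilizations in the same fixed-point scale.

/*
 * Illustrative only: admit a new -deadline reservation only if the
 * total -deadline bandwidth plus an estimate of the IRQ bandwidth
 * stays below the configured cap.
 */
static bool dl_admit_with_irq(u64 total_dl_bw, u64 new_bw,
			      u64 irq_bw_estimate, u64 max_bw)
{
	return total_dl_bw + new_bw + irq_bw_estimate <= max_bw;
}

With the numbers above (a 60% reservation plus a 40% IRQ load), such a
test would refuse the reservation instead of letting the IRQ pressure
starve everything else.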



> And yes, at some point RT workloads need to be aware of the jitter
> injected by things like IRQs and such. But I believe the rationale was
> that for soft real-time workloads this current semantic was 'easier'
> because we get to ignore IRQ overhead for workload estimation etc.
> 
> What we could maybe do is track runtime in both rq_clock_task() and
> rq_clock() and detect where the rq_clock based one exceeds the period
> and then push out the deadline (and add runtime).
> 
> Maybe something along such lines; does that make sense?

Uhm... I have to study and test your patch... I'll comment on this
later.



Thanks,
Luca


> 
> ---
>  include/linux/sched.h   |  3 +++
>  kernel/sched/deadline.c | 53
> - 2 files changed, 38
> insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8f8a5418b627..6aec81cb3d2e 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -522,6 +522,9 @@ struct sched_dl_entity {
>   u64 deadline;   /*
> Absolute deadline for this instance   */ unsigned
> int   flags;  /* Specifying the
> scheduler behaviour   */ 
> + u64 wallstamp;
> + s64 walltime;
> +
>   /*
>* Some bool flags:
>*
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 91e4202b0634..633c8f36c700 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -683,16 +683,7 @@ static void replenish_dl_entity(struct
> sched_dl_entity *dl_se, if (dl_se->dl_yielded 
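
As a very rough sketch of the idea quoted above (reusing the wallstamp
and walltime fields added by the diff; everything simplified and
untested, this is not the actual patch), the enforcement side could
look like:

/*
 * Sketch only: charge wall-clock time in parallel with the
 * rq_clock_task() based runtime, and push the deadline out (adding
 * runtime) once a full period of wall-clock time has been consumed.
 */
static void dl_account_walltime(struct sched_dl_entity *dl_se, u64 now_wall)
{
	s64 delta = now_wall - dl_se->wallstamp;

	dl_se->wallstamp = now_wall;
	dl_se->walltime += delta;

	while (dl_se->walltime >= (s64)dl_se->dl_period) {
		dl_se->walltime -= dl_se->dl_period;
		dl_se->deadline += dl_se->dl_period;
		dl_se->runtime  += dl_se->dl_runtime;
	}
}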

Re: INFO: rcu detected stall in do_idle

2018-10-19 Thread luca abeni
On Fri, 19 Oct 2018 13:39:42 +0200
Peter Zijlstra  wrote:

> On Thu, Oct 18, 2018 at 01:08:11PM +0200, luca abeni wrote:
> > Ok, I see the issue now: the problem is that the "while
> > (dl_se->runtime <= 0)" loop is executed at replenishment time, but
> > the deadline should be postponed at enforcement time.
> > 
> > I mean: in update_curr_dl() we do:
> > dl_se->runtime -= scaled_delta_exec;
> > if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
> > ...
> > enqueue replenishment timer at dl_next_period(dl_se)
> > But dl_next_period() is based on a "wrong" deadline!
> > 
> > 
> > I think that inserting a
> > while (dl_se->runtime <= -pi_se->dl_runtime) {
> > dl_se->deadline += pi_se->dl_period;
> > dl_se->runtime += pi_se->dl_runtime;
> > }
> > immediately after "dl_se->runtime -= scaled_delta_exec;" would fix
> > the problem, no?  
> 
> That certainly makes sense to me.

Good; I'll try to work on this idea in the weekend.


Thanks,
Luca

> The only remaining issue would then
> be placing a limit on the amount of times we can take that loop;
> which, as you propose in a later email; can be done separately as a
> limit on runtime.



Re: INFO: rcu detected stall in do_idle

2018-10-18 Thread luca abeni
Hi Juri,

On Thu, 18 Oct 2018 14:21:42 +0200
Juri Lelli  wrote:
[...]
> > > > I missed the original emails, but maybe the issue is that the
> > > > task blocks before the tick, and when it wakes up again
> > > > something goes wrong with the deadline and runtime assignment?
> > > > (maybe because the deadline is in the past?)
> > > 
> > > No, the problem is that the task won't be throttled at all,
> > > because its replenishing instant is always way in the past when
> > > tick occurs. :-/  
> > 
> > Ok, I see the issue now: the problem is that the "while
> > (dl_se->runtime <= 0)" loop is executed at replenishment time, but
> > the deadline should be postponed at enforcement time.
> > 
> > I mean: in update_curr_dl() we do:
> > dl_se->runtime -= scaled_delta_exec;
> > if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
> > ...
> > enqueue replenishment timer at dl_next_period(dl_se)
> > But dl_next_period() is based on a "wrong" deadline!
> > 
> > 
> > I think that inserting a
> > while (dl_se->runtime <= -pi_se->dl_runtime) {
> > dl_se->deadline += pi_se->dl_period;
> > dl_se->runtime += pi_se->dl_runtime;
> > }
> > immediately after "dl_se->runtime -= scaled_delta_exec;" would fix
> > the problem, no?  
> 
> Mmm, I also thought of letting the task "pay back" its overrunning.
> But, doesn't this get us quite far from what one would expect. I mean,
> enforcement granularity will be way different from task period, no?

Yes, the granularity will be what the kernel can provide (due to the HZ
value and to the hrtick on/off state). But at least the task will not
starve non-deadline tasks (which is the bug that originated this
discussion, I think).

If I understand well, there are two different (and orthogonal) issues
here:
1) Due to a bug in the accounting / enforcement mechanisms
   (the wrong placement of the while() loop), the task consumes 100% of
   the CPU time, starving non-deadline tasks
2) Due to the large HZ value, the small runtime (and period) and the
   fact that hrtick is disabled, the kernel cannot provide the
   requested scheduling granularity

The second issue can be fixed by imposing limits on minimum and maximum
runtime and the first issue can be fixed by changing the code as I
suggested in my previous email.

I would suggest addressing these two issues with separate changes
(the current replenishment code looks strange anyway).



Luca


Re: INFO: rcu detected stall in do_idle

2018-10-18 Thread luca abeni
On Thu, 18 Oct 2018 12:47:13 +0200
Juri Lelli  wrote:

> Hi,
> 
> On 18/10/18 12:23, luca abeni wrote:
> > Hi Juri,
> > 
> > On Thu, 18 Oct 2018 10:28:38 +0200
> > Juri Lelli  wrote:
> > [...]  
> > >  struct sched_attr {
> > > .size = 0,
> > > .policy   = 6,
> > > .flags= 0,
> > > .nice = 0,
> > > .priority = 0,
> > > .runtime  = 0x9917,
> > > .deadline = 0x,
> > > .period   = 0,
> > >  }
> > > 
> > > So, we seem to be correctly (in theory, see below) accepting the
> > > task.
> > > 
> > > What seems to generate the problem here is that CONFIG_HZ=100 and
> > > reproducer task has "tiny" runtime (~40us) and deadline (~66us)
> > > parameters, combination that "bypasses" the enforcing mechanism
> > > (performed at each tick).  
> > 
> > Ok, so the task can execute for at most 1 tick before being
> > throttled... Which does not look too bad.
> > 
> > I missed the original emails, but maybe the issue is that the task
> > blocks before the tick, and when it wakes up again something goes
> > wrong with the deadline and runtime assignment? (maybe because the
> > deadline is in the past?)  
> 
> No, the problem is that the task won't be throttled at all, because
> its replenishing instant is always way in the past when tick
> occurs. :-/

Ok, I see the issue now: the problem is that the "while (dl_se->runtime
<= 0)" loop is executed at replenishment time, but the deadline should
be postponed at enforcement time.

I mean: in update_curr_dl() we do:
dl_se->runtime -= scaled_delta_exec;
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
...
enqueue replenishment timer at dl_next_period(dl_se)
But dl_next_period() is based on a "wrong" deadline!


I think that inserting a
while (dl_se->runtime <= -pi_se->dl_runtime) {
dl_se->deadline += pi_se->dl_period;
dl_se->runtime += pi_se->dl_runtime;
}
immediately after "dl_se->runtime -= scaled_delta_exec;" would fix the
problem, no?
If we go this way, then we can remove the while loop from
replenish_dl_entity(), and change it in
WARN_ON(dl_se->runtime <= -pi_se->dl_runtime);
WARN_ON(dl_se->runtime > 0);
dl_se->deadline += pi_se->dl_period;
dl_se->runtime += pi_se->dl_runtime;
or something similar.
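
Putting the two halves together, a sketch of the resulting shape
(simplified and untested; pi_se is the entity whose parameters are in
use, as in the code of that time):

/* enforcement side: called right after dl_se->runtime -= scaled_delta_exec */
static void dl_postpone_deadline(struct sched_dl_entity *dl_se,
				 struct sched_dl_entity *pi_se)
{
	while (dl_se->runtime <= -((s64)pi_se->dl_runtime)) {
		dl_se->deadline += pi_se->dl_period;
		dl_se->runtime  += pi_se->dl_runtime;
	}
}

/* replenishment side: at most one postponement should be left to do */
static void dl_replenish_once(struct sched_dl_entity *dl_se,
			      struct sched_dl_entity *pi_se)
{
	WARN_ON(dl_se->runtime <= -((s64)pi_se->dl_runtime));
	WARN_ON(dl_se->runtime > 0);

	dl_se->deadline += pi_se->dl_period;
	dl_se->runtime  += pi_se->dl_runtime;
}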


Luca


Re: INFO: rcu detected stall in do_idle

2018-10-18 Thread luca abeni
Hi Juri,

On Thu, 18 Oct 2018 12:10:08 +0200
Juri Lelli  wrote:
[...]
> > Yes, a HZ related limit sounds like something we'd want. But if
> > we're going to do a minimum sysctl, we should also consider adding
> > a maximum, if you set a massive period/deadline, you can, even with
> > a relatively low u, incur significant delays.
> > 
> > And do we want to put the limit on runtime or on period ?
> > 
> > That is, something like:
> > 
> >   TICK_NSEC/2 < period < 10*TICK_NSEC
> > 
> > and/or
> > 
> >   TICK_NSEC/2 < runtime < 10*TICK_NSEC
> > 
> > Hmm, for HZ=1000 that ends up with a max period of 10ms, that's far
> > too low, 24Hz needs ~41ms. We can of course also limit the runtime
> > by capping u for users (as we should anyway).  
> 
> I also thought of TICK_NSEC/2 as a reasonably safe lower limit

I tend to think that something larger than "2" should be used (maybe
10? I mean: even if HZ = 100, it might make sense to allow a runtime
equal to 1ms...)
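
In code, the check being discussed could be as simple as the following
(hypothetical sketch, just to show the orders of magnitude):

/* Hypothetical sanity check on the requested runtime (sketch only). */
static bool dl_runtime_large_enough(const struct sched_attr *attr)
{
	/* TICK_NSEC / 10 is 1 ms with HZ = 100 and 100 us with HZ = 1000 */
	return attr->sched_runtime >= TICK_NSEC / 10;
}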


> that will implicitly limit period as well since
> 
>runtime <= deadline <= period

I agree that if we end up with TICK_NSEC/2 for the runtime limit then
explicitly enforcing a minimum period is not needed.



> Not sure about the upper limit, though. Lower limit is something
> related to the inherent granularity of the platform/config, upper
> limit is more to do with highest prio stuff with huge period delaying
> everything else; doesn't seem to be related to HZ?

I agree


Luca


Re: INFO: rcu detected stall in do_idle

2018-10-18 Thread luca abeni
Hi Peter,

On Thu, 18 Oct 2018 11:48:50 +0200
Peter Zijlstra  wrote:
[...]
> > So, I tend to think that we might want to play safe and put some
> > higher minimum value for dl_runtime (it's currently at 1ULL <<
> > DL_SCALE). Guess the problem is to pick a reasonable value, though.
> > Maybe link it someway to HZ? Then we might add a sysctl (or
> > similar) thing with which knowledgeable users can do whatever they
> > think their platform/config can support?  
> 
> Yes, a HZ related limit sounds like something we'd want. But if we're
> going to do a minimum sysctl, we should also consider adding a
> maximum, if you set a massive period/deadline, you can, even with a
> relatively low u, incur significant delays.

I agree with this.


> And do we want to put the limit on runtime or on period ?

I think we should have a minimum allowed runtime, a maximum allowed
runtime, a minimum allowed period and a (per-user? per-control
group?) maximum allowed utilization.

I suspect having a maximum period is useless, if we already enforce a
maximum runtime.


> That is, something like:
> 
>   TICK_NSEC/2 < period < 10*TICK_NSEC

As written above I would not enforce a maximum period.


> 
> and/or
> 
>   TICK_NSEC/2 < runtime < 10*TICK_NSEC

I think (but I might be wrong) that "TICK_NSEC/2" is too large... I
would divide the tick by a larger number (how many times do we want to
allow the loop to run?)

And I think the maximum runtime should not be TICK-dependent... It is
the maximum amount of time for which we allow the deadline task to
starve non-deadline tasks, so it should be an absolute time, not
something HZ-dependent... No?



> Hmm, for HZ=1000 that ends up with a max period of 10ms, that's far
> too low, 24Hz needs ~41ms. We can of course also limit the runtime by
> capping u for users (as we should anyway).

Regarding capping u for users: some time ago, Juri and I discussed
the idea of having per-cgroup limits on the deadline utilization... I
think this is a good idea (and if the userspace creates a cgroup per
user, this results in per-user capping - but it is more flexible in
general)
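
As a sketch of what such a per-group cap could look like (structure and
names are completely made up, this is not an existing interface):

struct dl_group_limit {
	u64 used_bw;	/* sum of runtime/period over the group's members */
	u64 max_bw;	/* cap configured for this cgroup (or user)       */
};

/* called when a member of the group asks for a new -deadline reservation */
static bool dl_group_admit(struct dl_group_limit *g, u64 new_bw)
{
	if (g->used_bw + new_bw > g->max_bw)
		return false;

	g->used_bw += new_bw;
	return true;
}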



Luca


Re: INFO: rcu detected stall in do_idle

2018-10-18 Thread luca abeni
Hi Juri,

On Thu, 18 Oct 2018 10:28:38 +0200
Juri Lelli  wrote:
[...]
>  struct sched_attr {
> .size = 0,
> .policy   = 6,
> .flags= 0,
> .nice = 0,
> .priority = 0,
> .runtime  = 0x9917,
> .deadline = 0x,
> .period   = 0,
>  }
> 
> So, we seem to be correctly (in theory, see below) accepting the task.
> 
> What seems to generate the problem here is that CONFIG_HZ=100 and
> reproducer task has "tiny" runtime (~40us) and deadline (~66us)
> parameters, combination that "bypasses" the enforcing mechanism
> (performed at each tick).

Ok, so the task can execute for at most 1 tick before being throttled...
Which does not look too bad.

I missed the original emails, but maybe the issue is that the task
blocks before the tick, and when it wakes up again something goes wrong
with the deadline and runtime assignment? (maybe because the deadline
is in the past?)


> Another side problem seems also to be that with such tiny parameters
> we spend lot of time in the while (dl_se->runtime <= 0) loop of
> replenish_dl_ entity() (actually uselessly, as deadline is most
> probably going to still be in the past when eventually runtime
> becomes positive again), as delta_exec is huge w.r.t. runtime and
> runtime has to keep up with tiny increments of dl_runtime. I guess we
> could ameliorate things here by limiting the number of time we
> execute the loop before bailing out.

Actually, I think the loop will iterate at most 10ms / 39us times, which
is about 256 times, right? If this is too much (I do not know how much
time is spent executing the loop), then the solution is (as you
suggest) to increase the minimum allowed runtime.
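
For concreteness, a throw-away userspace check of that estimate (the
runtime value is the one from the sched_attr dump quoted above):

#include <stdio.h>

int main(void)
{
	unsigned long long tick_ns = 10ULL * 1000 * 1000; /* HZ = 100 -> 10 ms tick */
	unsigned long long runtime_ns = 0x9917;           /* ~39.2 us, from the dump */

	/* worst-case iterations of the replenishment loop in one tick */
	printf("%llu\n", tick_ns / runtime_ns);           /* prints 255 */
	return 0;
}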

[...]
> So, I tend to think that we might want to play safe and put some
> higher minimum value for dl_runtime (it's currently at 1ULL <<
> DL_SCALE). Guess the problem is to pick a reasonable value, though.
> Maybe link it someway to HZ?

Yes, a value dependent on HZ looks like a good idea. I would propose
TICK_NSEC / N, where N is the maximum number of times you want the loop above
to be executed.


> Then we might add a sysctl (or similar)
> thing with which knowledgeable users can do whatever they think their
> platform/config can support?

I guess this can be related to the utilization limits we were
discussing some time ago... I would propose a cgroup-based interface to
set all of these limits.



Luca


Re: [RFD/RFC PATCH 5/8] sched: Add proxy execution

2018-10-12 Thread luca abeni
On Thu, 11 Oct 2018 14:53:25 +0200
Peter Zijlstra  wrote:

[...]
> > > > +   if (rq->curr != rq->idle) {
> > > > +   rq->proxy = rq->idle;
> > > > +   set_tsk_need_resched(rq->idle);
> > > > +   /*
> > > > +* XXX [juril] don't we still need to migrate
> > > > @next to
> > > > +* @owner's CPU?
> > > > +*/
> > > > +   return rq->idle;
> > > > +   }  
> > > 
> > > If I understand well, this code ends up migrating the task only
> > > if the CPU was previously idle? (scheduling the idle task if the
> > > CPU was not previously idle)
> > > 
> > > Out of curiosity (I admit this is my ignorance), why is this
> > > needed? If I understand well, after scheduling the idle task the
> > > scheduler will be invoked again (because of the
> > > set_tsk_need_resched(rq->idle)) but I do not understand why it is
> > > not possible to migrate task "p" immediately (I would just check
> > > "rq->curr != p", to avoid migrating the currently scheduled
> > > task).  
[...]
> I think it was the safe and simple choice; note that we're not
> migrating just a single @p, but a whole chain of @p.

Ah, that's the point I was missing... Thanks for explaining, now
everything looks more clear!


But... Here is my next dumb question: once the tasks are migrated to
the other runqueue, what prevents the scheduler from migrating them
back? In particular, task p: if it is (for example) a fixed priority
task and is on this runqueue, it is probably because the FP invariant
wants this... So, the push mechanism might end up migrating p back to
this runqueue soon... No?

Another doubt: if I understand well, when a task p "blocks" on a mutex
the proxy mechanism migrates it (and the whole chain of blocked tasks)
to the owner's core... Right?
Now, I understand why this is simpler to implement, but from the
schedulability point of view shouldn't we migrate the owner to p's core
instead?


Thanks,
Luca

> rq->curr must
> not be any of the possible @p's. rq->idle, is per definition not one
> of the @p's.
> 
> Does that make sense?



Re: [RFD/RFC PATCH 0/8] Towards implementing proxy execution

2018-10-10 Thread luca abeni
Hi all,

On Tue,  9 Oct 2018 11:24:26 +0200
Juri Lelli  wrote:

> Hi all,
> 
> Proxy Execution (also goes under several other names) isn't a new
> concept, it has been mentioned already in the past to this community
> (both in email discussions and at conferences [1, 2]), but no actual
> implementation that applies to a fairly recent kernel exists as of
> today (of which I'm aware of at least - happy to be proven wrong).
> 
> Very broadly speaking, more info below, proxy execution enables a task
> to run using the context of some other task that is "willing" to
> participate in the mechanism, as this helps both tasks to improve
> performance (w.r.t. the latter task not participating to proxy
> execution).

First of all, thanks to Juri for working on this!

I am looking at the patchset, and I have some questions / comments
(maybe some of my questions are really stupid, I do not know... :)


To begin, I have a very high-level comment about proxy execution: I
believe proxy execution is a very powerful concept that can be used in
many cases, not only to do inheritance on mutual exclusion (think, for
example, about pipelines of tasks: a task implementing a stage of the
pipeline can act as a proxy for the task implementing the previous
stage).

So, I would propose to make the proxy() function of patch more generic,
and not strictly bound to mutexes. Maybe a task structure can contain a
list of tasks for which the task can act as a proxy, and we can have a
function like "I want to act as a proxy for task T" to be invoked when
a task blocks?
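
Just to make the suggestion concrete, the interface could be something
along these lines (all names invented, nothing like this exists in the
posted series):

/* per-task list of tasks we are willing to act as a proxy for */
struct proxy_relation {
	struct task_struct	*donor;	/* task whose work we may proxy      */
	struct list_head	node;	/* entry in the proxying task's list */
};

/* "I want to act as a proxy for task T", to be called before blocking */
int proxy_for_task(struct task_struct *t);

/* drop the relation again, e.g. when the pipeline stage is done */
void proxy_end(struct task_struct *t);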



Luca


Re: [RFD/RFC PATCH 0/8] Towards implementing proxy execution

2018-10-10 Thread luca abeni
On Wed, 10 Oct 2018 12:57:10 +0200
Peter Zijlstra  wrote:

> On Wed, Oct 10, 2018 at 12:34:17PM +0200, luca abeni wrote:
> > So, I would propose to make the proxy() function of patch more
> > generic, and not strictly bound to mutexes. Maybe a task structure
> > can contain a list of tasks for which the task can act as a proxy,
> > and we can have a function like "I want to act as a proxy for task
> > T" to be invoked when a task blocks?  
> 
> Certainly possible, but that's something I'd prefer to look at after
> it all 'works'.

Of course :)
I was mentioning this idea because maybe it can have some impact on the
design.

BTW, here is another "interesting" issue I had in the past with changes
like this one: how do we check if the patchset works as expected?

"No crashes" is surely a requirement, but I think we also need some
kind of testcase that fails if the inheritance mechanism is not working
properly, and is successful if the inheritance works.

Maybe we can develop some testcase based on rt-app (if no one has such a
testcase already)


Thanks,
Luca
> The mutex blocking thing doesn't require external
> interfaces etc..



Re: [RFD/RFC PATCH 5/8] sched: Add proxy execution

2018-10-10 Thread luca abeni
Hi,

On Tue,  9 Oct 2018 11:24:31 +0200
Juri Lelli  wrote:
[...]
> +migrate_task:
[...]
> + put_prev_task(rq, next);
> + if (rq->curr != rq->idle) {
> + rq->proxy = rq->idle;
> + set_tsk_need_resched(rq->idle);
> + /*
> +  * XXX [juril] don't we still need to migrate @next
> to
> +  * @owner's CPU?
> +  */
> + return rq->idle;
> + }

If I understand well, this code ends up migrating the task only if the
CPU was previously idle? (scheduling the idle task if the CPU was not
previously idle)

Out of curiosity (I admit this is my ignorance), why is this needed?
If I understand well, after scheduling the idle task the scheduler will
be invoked again (because of the set_tsk_need_resched(rq->idle)) but I
do not understand why it is not possible to migrate task "p" immediately
(I would just check "rq->curr != p", to avoid migrating the currently
scheduled task).


Thanks,
Luca

> + rq->proxy = _task;
> +
> + for (; p; p = p->blocked_task) {
> + int wake_cpu = p->wake_cpu;
> +
> + WARN_ON(p == rq->curr);
> +
> + p->on_rq = TASK_ON_RQ_MIGRATING;
> + dequeue_task(rq, p, 0);
> + set_task_cpu(p, that_cpu);
> + /*
> +  * We can abuse blocked_entry to migrate the thing,
> because @p is
> +  * still on the rq.
> +  */
> + list_add(>blocked_entry, _list);
> +
> + /*
> +  * Preserve p->wake_cpu, such that we can tell where
> it
> +  * used to run later.
> +  */
> + p->wake_cpu = wake_cpu;
> + }
> +
> + rq_unpin_lock(rq, rf);
> + raw_spin_unlock(>lock);
> + raw_spin_lock(_rq->lock);
> +
> + while (!list_empty(_list)) {
> + p = list_first_entry(_list, struct
> task_struct, blocked_entry);
> + list_del_init(>blocked_entry);
> +
> + enqueue_task(that_rq, p, 0);
> + check_preempt_curr(that_rq, p, 0);
> + p->on_rq = TASK_ON_RQ_QUEUED;
> + resched_curr(that_rq);
> + }
> +
> + raw_spin_unlock(_rq->lock);
> + raw_spin_lock(>lock);
> + rq_repin_lock(rq, rf);
> +
> + return NULL; /* Retry task selection on _this_ CPU. */
> +
> +owned_task:
> + /*
> +  * Its possible we interleave with mutex_unlock like:
> +  *
> +  *  lock(>lock);
> +  *proxy()
> +  * mutex_unlock()
> +  *   lock(_lock);
> +  *   next(owner) = current->blocked_task;
> +  *   unlock(_lock);
> +  *
> +  *   wake_up_q();
> +  * ...
> +  *   ttwu_remote()
> +  * __task_rq_lock()
> +  *lock(_lock);
> +  *owner == p
> +  *
> +  * Which leaves us to finish the ttwu_remote() and make it
> go.
> +  *
> +  * XXX is this happening in case of an HANDOFF to p?
> +  * In any case, reading of the owner in
> __mutex_unlock_slowpath is
> +  * done atomically outside wait_lock (only adding waiters to
> wake_q is
> +  * done inside the critical section).
> +  * Does this means we can get to proxy _w/o an owner_ if
> that was
> +  * cleared before grabbing wait_lock? Do we account for this
> case?
> +  * OK we actually do (see PROXY_EXEC ifdeffery in unlock
> function).
> +  */
> +
> + /*
> +  * Finish wakeup, will make the contending ttwu do a
> +  * _spurious_ wakeup, but all code should be able to
> +  * deal with that.
> +  */
> + owner->blocked_on = NULL;
> + owner->state = TASK_RUNNING;
> + // XXX task_woken
> +
> + /*
> +  * If @owner/@p is allowed to run on this CPU, make it go.
> +  */
> + if (cpumask_test_cpu(this_cpu, >cpus_allowed)) {
> + raw_spin_unlock(>wait_lock);
> + return owner;
> + }
> +
> + /*
> +  * We have to let ttwu fix things up, because we
> +  * can't restore the affinity. So dequeue.
> +  */
> + owner->on_rq = 0;
> + deactivate_task(rq, p, DEQUEUE_SLEEP);
> + goto blocked_task;
> +
> +blocked_task:
> + /*
> +  * If !@owner->on_rq, holding @rq->lock will not pin the
> task,
> +  * so we cannot drop @mutex->wait_lock until we're sure its
> a blocked
> +  * task on this rq.
> +  *
> +  * We use @owner->blocked_lock to serialize against
> ttwu_activate().
> +  * Either we see its new owner->on_rq or it will see our
> list_add().
> +  */
> + raw_spin_lock(>blocked_lock);
> +
> + /*
> +  * If we became runnable while waiting for blocked_lock,
> retry.
> +  */
> + if (owner->on_rq) {
> + /*
> +  * If we see the new on->rq, we must also see the
> new task_cpu().
> +  */
> + 

Re: [RFD/RFC PATCH 3/8] locking/mutex: Rework task_struct::blocked_on

2018-10-10 Thread luca abeni
Hi,

On Tue,  9 Oct 2018 11:24:29 +0200
Juri Lelli  wrote:

> From: Peter Zijlstra 
> 
> Track the blocked-on relation for mutexes, this allows following this
> relation at schedule time. Add blocked_task to track the inverse
> relation.
> 
> ,-> task
> | | blocked-on
> | v
>blocked-task |   mutex
> | | owner
> | v
> `-- task

I was a little bit confused by this description, because (if I
understand the code well) blocked_task does not actually track the
inverse of the "blocked_on" relationship, but just points to the task
that is _currently_ acting as a proxy for a given task.

In theory, we could have multiple tasks blocked on "mutex" (which is
owned by "task"), so if "blocked_task" tracked the inverse of
"blocked_on" it should have been a list (or a data structure containing
pointers to multiple task structures), no?

I would propose to change "blocked_task" into something like
"current_proxy", or similar, which should be more clear (unless I
completely misunderstood this stuff... In that case, sorry about the
noise)


Also, I suspect that this "blocked_task" (or "current_proxy") field
should be introduced in patch 5 (same for the "task_is_blocked()"
function from patch 4... Should it go in patch 5?)

Luca
> 
> This patch only enables blocked-on relation, blocked-task will be
> enabled in a later patch implementing proxy().
> 
> Signed-off-by: Peter Zijlstra (Intel) 
> [minor changes while rebasing]
> Signed-off-by: Juri Lelli 
> ---
>  include/linux/sched.h| 6 ++
>  kernel/fork.c| 6 +++---
>  kernel/locking/mutex-debug.c | 7 +++
>  kernel/locking/mutex.c   | 3 +++
>  4 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 977cb57d7bc9..a35e8ab3eef1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -907,10 +907,8 @@ struct task_struct {
>   struct rt_mutex_waiter  *pi_blocked_on;
>  #endif
>  
> -#ifdef CONFIG_DEBUG_MUTEXES
> - /* Mutex deadlock detection: */
> - struct mutex_waiter *blocked_on;
> -#endif
> + struct task_struct  *blocked_task;  /* task
> that's boosting us */
> + struct mutex*blocked_on;/* lock
> we're blocked on */ 
>  #ifdef CONFIG_TRACE_IRQFLAGS
>   unsigned intirq_events;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f0b58479534f..ef27a675b0d7 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1827,9 +1827,9 @@ static __latent_entropy struct task_struct
> *copy_process( lockdep_init_task(p);
>  #endif
>  
> -#ifdef CONFIG_DEBUG_MUTEXES
> - p->blocked_on = NULL; /* not blocked yet */
> -#endif
> + p->blocked_task = NULL; /* nobody is boosting us yet*/
> + p->blocked_on = NULL;  /* not blocked yet */
> +
>  #ifdef CONFIG_BCACHE
>   p->sequential_io= 0;
>   p->sequential_io_avg= 0;
> diff --git a/kernel/locking/mutex-debug.c
> b/kernel/locking/mutex-debug.c index a660d38b6c29..6605e083a3e9 100644
> --- a/kernel/locking/mutex-debug.c
> +++ b/kernel/locking/mutex-debug.c
> @@ -53,8 +53,8 @@ void debug_mutex_add_waiter(struct mutex *lock,
> struct mutex_waiter *waiter, {
>   SMP_DEBUG_LOCKS_WARN_ON(!raw_spin_is_locked(>wait_lock));
>  
> - /* Mark the current thread as blocked on the lock: */
> - task->blocked_on = waiter;
> + /* Current thread can't be alredy blocked (since it's
> executing!) */
> + DEBUG_LOCKS_WARN_ON(task->blocked_on);
>  }
>  
>  void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter
> *waiter, @@ -62,8 +62,7 @@ void mutex_remove_waiter(struct mutex
> *lock, struct mutex_waiter *waiter, {
>   DEBUG_LOCKS_WARN_ON(list_empty(>list));
>   DEBUG_LOCKS_WARN_ON(waiter->task != task);
> - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
> - task->blocked_on = NULL;
> + DEBUG_LOCKS_WARN_ON(task->blocked_on != lock);
>  
>   list_del_init(>list);
>   waiter->task = NULL;
> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index f37402cd8496..76b59b555da3 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -979,6 +979,7 @@ __mutex_lock_common(struct mutex *lock, long
> state, unsigned int subclass, }
>  
>   waiter.task = current;
> + current->blocked_on = lock;
>  
>   set_current_state(state);
>   for (;;) {
> @@ -1047,6 +1048,8 @@ __mutex_lock_common(struct mutex *lock, long
> state, unsigned int subclass, }
>  
>   mutex_remove_waiter(lock, , current);
> + current->blocked_on = NULL;
> +
>   if (likely(list_empty(>wait_list)))
>   __mutex_clear_flag(lock, MUTEX_FLAGS);
>  



Re: [RFD/RFC PATCH 3/8] locking/mutex: Rework task_struct::blocked_on

2018-10-10 Thread luca abeni
Hi,

On Tue,  9 Oct 2018 11:24:29 +0200
Juri Lelli  wrote:

> From: Peter Zijlstra 
> 
> Track the blocked-on relation for mutexes, this allows following this
> relation at schedule time. Add blocked_task to track the inverse
> relation.
> 
> ,-> task
> | | blocked-on
> | v
>blocked-task |   mutex
> | | owner
> | v
> `-- task

I was a little bit confused by this description, because (if I
understand the code well) blocked_task does not actually track the
inverse of the "blocked_on" relationship, but just points to the task
that is _currently_ acting as a proxy for a given task.

In theory, we could have multiple tasks blocked on "mutex" (which is
owned by "task"), so if "blocked_task" tracked the inverse of
"blocked_on" it should have been a list (or a data structure containing
pointers to multiple task structures), no?

I would propose to change "blocked_task" into something like
"current_proxy", or similar, which should be more clear (unless I
completely misunderstood this stuff... In that case, sorry about the
noise)


Also, I suspect that this "blocked_task" (or "current_proxy") field
should be introcuced in patch 5 (same for the "task_is_blocked()"
function from patch 4... Should it go in patch 5?)

Luca
> 
> This patch only enables blocked-on relation, blocked-task will be
> enabled in a later patch implementing proxy().
> 
> Signed-off-by: Peter Zijlstra (Intel) 
> [minor changes while rebasing]
> Signed-off-by: Juri Lelli 
> ---
>  include/linux/sched.h| 6 ++
>  kernel/fork.c| 6 +++---
>  kernel/locking/mutex-debug.c | 7 +++
>  kernel/locking/mutex.c   | 3 +++
>  4 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 977cb57d7bc9..a35e8ab3eef1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -907,10 +907,8 @@ struct task_struct {
>   struct rt_mutex_waiter  *pi_blocked_on;
>  #endif
>  
> -#ifdef CONFIG_DEBUG_MUTEXES
> - /* Mutex deadlock detection: */
> - struct mutex_waiter *blocked_on;
> -#endif
> + struct task_struct  *blocked_task;  /* task
> that's boosting us */
> + struct mutex*blocked_on;/* lock
> we're blocked on */ 
>  #ifdef CONFIG_TRACE_IRQFLAGS
>   unsigned intirq_events;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f0b58479534f..ef27a675b0d7 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1827,9 +1827,9 @@ static __latent_entropy struct task_struct
> *copy_process( lockdep_init_task(p);
>  #endif
>  
> -#ifdef CONFIG_DEBUG_MUTEXES
> - p->blocked_on = NULL; /* not blocked yet */
> -#endif
> + p->blocked_task = NULL; /* nobody is boosting us yet*/
> + p->blocked_on = NULL;  /* not blocked yet */
> +
>  #ifdef CONFIG_BCACHE
>   p->sequential_io= 0;
>   p->sequential_io_avg= 0;
> diff --git a/kernel/locking/mutex-debug.c
> b/kernel/locking/mutex-debug.c index a660d38b6c29..6605e083a3e9 100644
> --- a/kernel/locking/mutex-debug.c
> +++ b/kernel/locking/mutex-debug.c
> @@ -53,8 +53,8 @@ void debug_mutex_add_waiter(struct mutex *lock,
> struct mutex_waiter *waiter, {
>   SMP_DEBUG_LOCKS_WARN_ON(!raw_spin_is_locked(&lock->wait_lock));
>  
> - /* Mark the current thread as blocked on the lock: */
> - task->blocked_on = waiter;
> + /* Current thread can't be alredy blocked (since it's
> executing!) */
> + DEBUG_LOCKS_WARN_ON(task->blocked_on);
>  }
>  
>  void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter
> *waiter, @@ -62,8 +62,7 @@ void mutex_remove_waiter(struct mutex
> *lock, struct mutex_waiter *waiter, {
>   DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
>   DEBUG_LOCKS_WARN_ON(waiter->task != task);
> - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
> - task->blocked_on = NULL;
> + DEBUG_LOCKS_WARN_ON(task->blocked_on != lock);
>  
>   list_del_init(&waiter->list);
>   waiter->task = NULL;
> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index f37402cd8496..76b59b555da3 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -979,6 +979,7 @@ __mutex_lock_common(struct mutex *lock, long
> state, unsigned int subclass, }
>  
>   waiter.task = current;
> + current->blocked_on = lock;
>  
>   set_current_state(state);
>   for (;;) {
> @@ -1047,6 +1048,8 @@ __mutex_lock_common(struct mutex *lock, long
> state, unsigned int subclass, }
>  
>   mutex_remove_waiter(lock, &waiter, current);
> + current->blocked_on = NULL;
> +
>   if (likely(list_empty(&lock->wait_list)))
>   __mutex_clear_flag(lock, MUTEX_FLAGS);
>  



Re: [PATCH] sched/deadline: Fix switched_from_dl

2018-07-11 Thread luca abeni
On Wed, 11 Jul 2018 09:29:48 +0200
Juri Lelli  wrote:

> Mark noticed that syzkaller is able to reliably trigger the following
> 
>   dl_rq->running_bw > dl_rq->this_bw
>   WARNING: CPU: 1 PID: 153 at kernel/sched/deadline.c:124
> switched_from_dl+0x454/0x608 Kernel panic - not syncing:
> panic_on_warn set ...
> 
>   CPU: 1 PID: 153 Comm: syz-executor253 Not tainted 4.18.0-rc3+ #29
>   Hardware name: linux,dummy-virt (DT)
>   Call trace:
>dump_backtrace+0x0/0x458
>show_stack+0x20/0x30
>dump_stack+0x180/0x250
>panic+0x2dc/0x4ec
>__warn_printk+0x0/0x150
>report_bug+0x228/0x2d8
>bug_handler+0xa0/0x1a0
>brk_handler+0x2f0/0x568
>do_debug_exception+0x1bc/0x5d0
>el1_dbg+0x18/0x78
>switched_from_dl+0x454/0x608
>__sched_setscheduler+0x8cc/0x2018
>sys_sched_setattr+0x340/0x758
>el0_svc_naked+0x30/0x34
> 
> syzkaller reproducer runs a bunch of threads that constantly switch
> between DEADLINE and NORMAL classes while interacting through futexes.
> 
> The splat above is caused by the fact that if a DEADLINE task is
> setattr back to NORMAL while in non_contending state (blocked on a
> futex - inactive timer armed), its contribution to running_bw is not
> removed before sub_rq_bw() gets called (!task_on_rq_queued() branch)
> and the latter sees running_bw > this_bw.
> 
> Fix it by removing a task contribution from running_bw if the task is
> not queued and in non_contending state while switched to a different
> class.
> 
> Reported-by: Mark Rutland 
> Signed-off-by: Juri Lelli 
> ---
>  kernel/sched/deadline.c | 11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)

Reviewed-by: Luca Abeni 

And, thanks for taking care of this issue!


Thanks,
Luca


> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index fbfc3f1d368a..10c7b51c0d1f 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq,
> struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime)
>   task_non_contending(p);
>  
> - if (!task_on_rq_queued(p))
> + if (!task_on_rq_queued(p)) {
> + /*
> +  * Inactive timer is armed. However, p is leaving
> DEADLINE and
> +  * might migrate away from this rq while continuing
> to run on
> +  * some other class. We need to remove its
> contribution from
> +  * this rq running_bw now, or sub_rq_bw (below) will
> complain.
> +  */
> + if (p->dl.dl_non_contending)
> + sub_running_bw(&p->dl, &rq->dl);
>   sub_rq_bw(&p->dl, &rq->dl);
> + }
>  
>   /*
>* We cannot use inactive_task_timer() to invoke
> sub_running_bw()



Re: [PATCH v5 00/10] track CPU utilization

2018-06-06 Thread luca abeni
Hi all,

sorry; I missed the beginning of this thread... Anyway, below I add
some comments:

On Wed, 6 Jun 2018 15:05:58 +0200
Claudio Scordino  wrote:
[...]
> >> Ok, I see ... Have you guys already tried something like my patch
> >> above (keeping the freq >= this_bw) in real world use cases ? Is
> >> this costing that much energy in practice ? If we fill the gaps
> >> left by DL (when it  
> > 
> > IIRC, Claudio (now Cc-ed) did experiment a bit with both
> > approaches, so he might add some numbers to my words above. I
> > didn't (yet). But, please consider that I might be reserving (for
> > example) 50% of bandwidth for my heavy and time sensitive task and
> > then have that task wake up only once in a while (but I'll be
> > keeping clock speed up for the whole time). :/  
> 
> As far as I can remember, we never tested energy consumption of
> running_bw vs this_bw, as at OSPM'17 we had already decided to use
> running_bw implementing GRUB-PA. The rationale is that, as Juri
> pointed out, the amount of spare (i.e. reclaimable) bandwidth in
> this_bw is very user-dependent.

Yes, I agree with this. The appropriateness of using this_bw or
running_bw is highly workload-dependent... If a periodic task consumes
much less than its runtime (or if a sporadic task has inter-activation
times much larger than the SCHED_DEADLINE period), then running_bw has
to be preferred. But if a periodic task consumes almost all of its
runtime before blocking, then this_bw has to be preferred...
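
(As a purely illustrative example with made-up numbers: a task with
runtime 10ms and period 100ms contributes this_bw = 0.1; if it usually
blocks after about 2ms of execution, its whole 0.1 contribution is
removed from running_bw at the 0-lag time, so a running_bw-driven
governor can lower the frequency for most of the period, while a
this_bw-driven one would keep it raised the whole time.)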

But this also depends on the hardware: if the frequency switch time is
small, then running_bw is more appropriate... On the other hand,
this_bw works much better if the frequency switch time is high.
(Talking about this, I remember Claudio measured frequency switch times
as large as almost 3ms... Is this really due to hardware issues? Or maybe
there is some software issue involved? 3ms looks like a lot of time...)

Anyway, this is why I proposed to use some kind of /sys/... file to
control the kind of deadline utilization used for frequency scaling: in
this way, the system designer / administrator, who hopefully has the
needed information about workload and hardware, can optimize the
frequency scaling behaviour by deciding if running_bw or this_bw will be
used.
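
Just to make the idea concrete, a minimal sketch of the in-kernel side
of such a knob could look like the following (completely hypothetical:
no such interface exists in the posted patches, and all the names are
made up):

typedef unsigned long long u64;         /* stand-in for the kernel type */

enum dl_util_source {
        DL_UTIL_RUNNING_BW,     /* reclaim-friendly: follow running_bw */
        DL_UTIL_THIS_BW,        /* conservative: never go below this_bw */
};

/* Selected by the administrator through some /sys or sysctl file. */
static enum dl_util_source dl_util_source = DL_UTIL_RUNNING_BW;

/* Deadline utilization that the frequency-selection code would use. */
static u64 dl_freq_util(u64 running_bw, u64 this_bw)
{
        return (dl_util_source == DL_UTIL_THIS_BW) ? this_bw : running_bw;
}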


Luca

> For example, the user can let this_bw
> be much higher than the measured bandwidth, just to be sure that the
> deadlines are met even in corner cases. In practice, this means that
> the task executes for quite a short time and then blocks (with its
> bandwidth reclaimed, hence the CPU frequency reduced, at the 0lag
> time). Using this_bw rather than running_bw, the CPU frequency would
> remain at the same fixed value even when the task is blocked. I
> understand that on some cases it could even be better (i.e. no waste
> of energy in frequency switch). However, IMHO, these are corner cases
> and in the average case it is better to rely on running_bw and reduce
> the CPU frequency accordingly.
> 
> Best regards,
> 
>   Claudio



Re: [PATCH v5 00/10] track CPU utilization

2018-06-06 Thread luca abeni
Hi,

On Wed, 6 Jun 2018 14:20:46 +0100
Quentin Perret  wrote:
[...]
> > However, IMHO, these are corner cases and in the average case it is
> > better to rely on running_bw and reduce the CPU frequency
> > accordingly.  
> 
> My point was that accepting to go at a lower frequency than required
> by this_bw is fundamentally unsafe. If you're at a low frequency when
> a DL task starts, there are real situations where you won't be able
> to increase the frequency immediately, which can eventually lead to
> missing deadlines. Now, if this risk is known, has been discussed,
> and is accepted, that's fair enough. I'm just too late for the
> discussion :-)

Well, our conclusion was that this issue can be addressed when
designing the scheduling parameters:
- If we do not consider frequency scaling, a task can respect its
  deadlines if the SCHED_DEADLINE runtime is larger than the task's
  execution time and the SCHED_DEADLINE period is smaller than the
  task's period (and if some kind of "global" admission test is
  respected)
- Considering frequency scaling (and 0-time frequency switches), the
  SCHED_DEADLINE runtime must be larger than the task execution time at
  the highest frequency
- If the frequency switch time is larger than 0, then the
  SCHED_DEADLINE runtime must be larger than the task execution time
  (at the highest frequency) plus the frequency switch time

If this third condition is respected, I think that deadline misses can
be avoided even if running_bw is used (and the CPU takes a considerable
time to switch frequency). Of course, this requires an over-allocation
of runtime (and the global admission test has more probabilities to
fail)...
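
As a rough illustration of the third condition (my own sketch with
made-up numbers, not code from any patch):

#include <stdbool.h>
#include <stdint.h>

/*
 * The reserved SCHED_DEADLINE runtime must cover the worst-case
 * execution time at the highest frequency plus the time needed to
 * switch the CPU frequency.
 */
static bool dl_runtime_is_safe(uint64_t runtime_ns,
                               uint64_t wcet_at_max_freq_ns,
                               uint64_t freq_switch_ns)
{
        return runtime_ns > wcet_at_max_freq_ns + freq_switch_ns;
}

/*
 * Example: execution time at the maximum frequency = 2ms and frequency
 * switch time = 3ms  =>  the runtime must be larger than 5ms, i.e. more
 * than twice the reservation that would be needed with instantaneous
 * frequency switches.
 */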



Luca


Re: [PATCH V2 0/7] sched/deadline: fix cpusets bandwidth accounting

2018-02-02 Thread Luca Abeni
Hi Mathieu,

On Thu,  1 Feb 2018 09:51:02 -0700
Mathieu Poirier  wrote:

> This is the follow-up patchset to [1] that attempt to fix a problem
> reported by Steve Rostedt [2] where DL bandwidth accounting is not
> recomputed after CPUset and CPU hotplug operations.  When CPU hotplug and
> some CUPset manipulation take place root domains are destroyed and new ones
> created, loosing at the same time DL accounting information pertaining to
> utilisation.  Please see [1] for a full description of the approach.

I do not know the cgroup / cpuset code too well, so I have no useful
comments on your patches... But I think this patchset is a nice
improvement with respect to the current situation.

[...]
> A notable addition is patch 7/7 - it addresses a problem seen when hot
> plugging out a CPU where a DL task is running (see changelog for full
> details).  The issue is unrelated to this patchset and will manifest
> itself on a mainline kernel.

I think I introduced this bug with my reclaiming patches, so I am
interested.
When a CPU is hot-plugged out, which code in the kernel is responsible
for migrating the tasks that are executing on such a CPU? I was sure I
was handling all the relevant codepaths, but this bug clearly shows
that I was wrong.


Thanks,
Luca


Re: Sparse warnings from sched.h

2017-11-26 Thread luca abeni
On Sat, 25 Nov 2017 21:46:11 -0800
Jakub Kicinski  wrote:

> Hi!
> 
> Did these:
> 
> ./include/linux/sched.h:476:62: error: dubious one-bit signed bitfield
> ./include/linux/sched.h:477:62: error: dubious one-bit signed bitfield
> ./include/linux/sched.h:478:62: error: dubious one-bit signed bitfield
> ./include/linux/sched.h:479:62: error: dubious one-bit signed bitfield
> 
> got fixed?  I saw there were patches posted, but nothing have reached
> Linus's tree, yet.

I think a fix is in the tip tree:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=aa5222e92f8000ed3c1c38dddf11c83222aadfb3



Luca


Re: [PATCH] sched/deadline: Use bools for the state flags

2017-11-21 Thread Luca Abeni
Hello,

On Tue, 21 Nov 2017 11:44:05 +0100 (CET)
Jiri Kosina <ji...@kernel.org> wrote:

> From: Jiri Kosina <jkos...@suse.cz>
> 
> Commit
> 
>   799ba82de01e ("sched/deadline: Use C bitfields for the state flags")
> 
> converted state flags into one-bit signed int. Signed one-bit type can be 
> either 0 or -1, which is going to cause a problem once 1 is assigned to it 
> and then the value later tested against 1.

I think a different fix has just been committed to tip:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=aa5222e92f8000ed3c1c38dddf11c83222aadfb3


Luca
> 
> The current code is okay, as all the checks are (non-)zero check, but I 
> believe that we'd rather be safe than sorry here; remove the fragility by 
> converting the state flags to bool.
> 
> This also silences annoying sparse complaints about this very issue when 
> compiling any code that includes sched.h.
> 
> Fixes: 799ba82de01e ("sched/deadline: Use C bitfields for the state flags")
> Cc: luca abeni <luca.ab...@santannapisa.it>
> Cc: Peter Zijlstra (Intel) <pet...@infradead.org>
> Cc: Daniel Bristot de Oliveira <bris...@redhat.com>
> Cc: Juri Lelli <juri.le...@arm.com>
> Cc: Linus Torvalds <torva...@linux-foundation.org>
> Cc: Mathieu Poirier <mathieu.poir...@linaro.org>
> Cc: Mike Galbraith <efa...@gmx.de>
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: Steven Rostedt <rost...@goodmis.org>
> Cc: Thomas Gleixner <t...@linutronix.de>
> Signed-off-by: Jiri Kosina <jkos...@suse.cz>
> ---
>  include/linux/sched.h   |  8 
>  kernel/sched/core.c |  8 
>  kernel/sched/deadline.c | 32 
>  3 files changed, 24 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a5dc7c98b0a2..b19fa1b96726 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -473,10 +473,10 @@ struct sched_dl_entity {
>* conditions between the inactive timer handler and the wakeup
>* code.
>*/
> - int dl_throttled  : 1;
> - int dl_boosted: 1;
> - int dl_yielded: 1;
> - int dl_non_contending : 1;
> + booldl_throttled;
> + booldl_boosted;
> + booldl_yielded;
> + booldl_non_contending;
>  
>   /*
>* Bandwidth enforcement timer. Each -deadline task has its
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 75554f366fd3..f9bbf0f55e0e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3738,20 +3738,20 @@ void rt_mutex_setprio(struct task_struct *p, struct 
> task_struct *pi_task)
>   if (dl_prio(prio)) {
>   if (!dl_prio(p->normal_prio) ||
>   (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
> - p->dl.dl_boosted = 1;
> + p->dl.dl_boosted = true;
>   queue_flag |= ENQUEUE_REPLENISH;
>   } else
> - p->dl.dl_boosted = 0;
> + p->dl.dl_boosted = false;
>   p->sched_class = &dl_sched_class;
>   } else if (rt_prio(prio)) {
>   if (dl_prio(oldprio))
> - p->dl.dl_boosted = 0;
> + p->dl.dl_boosted = false;
>   if (oldprio < prio)
>   queue_flag |= ENQUEUE_HEAD;
>   p->sched_class = &rt_sched_class;
>   } else {
>   if (dl_prio(oldprio))
> - p->dl.dl_boosted = 0;
> + p->dl.dl_boosted = false;
>   if (rt_prio(oldprio))
>   p->rt.timeout = 0;
>   p->sched_class = &fair_sched_class;
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 2473736c7616..78cb00ae8b2f 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -133,7 +133,7 @@ void dl_change_utilization(struct task_struct *p, u64 
> new_bw)
>   rq = task_rq(p);
>   if (p->dl.dl_non_contending) {
>   sub_running_bw(p->dl.dl_bw, &rq->dl);
> - p->dl.dl_non_contending = 0;
> + p->dl.dl_non_contending = false;
>   /*
>* If the timer handler is currently running and the
>* timer cannot be cancelled, inactive_task_timer()

Re: [PATCH] sched: use unsigned int for one-bit bitfield in sched_dl_entity

2017-11-17 Thread Luca Abeni
Hi,

On Fri, 17 Nov 2017 14:50:11 +0800
Xin Long  wrote:

> This patch is to fix the 'dubious one-bit signed bitfield' error reported
> by sparse, when using 'make C=2'.
> 
> Fixes: 799ba82de01e ("sched/deadline: Use C bitfields for the state flags")
> Signed-off-by: Xin Long 

I think this is very similar to patches already sent by Dan Carpenter
and Matthew Wilcox. As for the previous patches, I think the change is
ok.



Luca



> ---
>  include/linux/sched.h | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a5dc7c9..3e35a37 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -473,10 +473,10 @@ struct sched_dl_entity {
>* conditions between the inactive timer handler and the wakeup
>* code.
>*/
> - int dl_throttled  : 1;
> - int dl_boosted: 1;
> - int dl_yielded: 1;
> - int dl_non_contending : 1;
> + unsigned intdl_throttled  : 1,
> + dl_boosted: 1,
> + dl_yielded: 1,
> + dl_non_contending : 1;
>  
>   /*
>* Bandwidth enforcement timer. Each -deadline task has its



Re: New sparse warnings from sched.h

2017-11-15 Thread luca abeni
Hi,

On Tue, 14 Nov 2017 12:41:35 -0800
Matthew Wilcox <wi...@infradead.org> wrote:

> commit 799ba82de01e7543f6b2042e1a739f3a20255f23
> Author: luca abeni <luca.ab...@santannapisa.it>
> Date:   Thu Sep 7 12:09:31 2017 +0200
> 
> sched/deadline: Use C bitfields for the state flags
> 
> Ask the compiler to use a single bit for storing true / false
> values, instead of wasting the size of a whole int value.
> Tested with gcc 5.4.0 on x86_64, and the compiler produces the
> expected Assembly (similar to the Assembly code generated when
> explicitly accessing the bits with bitmasks, "&" and "|").
> 
> produces four warnings from sparse for every file which includes
> sched.h:
> 
> ./include/linux/sched.h:476:62: error: dubious one-bit signed bitfield
> ./include/linux/sched.h:477:62: error: dubious one-bit signed bitfield
> ./include/linux/sched.h:478:62: error: dubious one-bit signed bitfield
> ./include/linux/sched.h:479:62: error: dubious one-bit signed bitfield
> 
> This seems like the trivial fix (untested):
[...]

I think Dan Carpenter sent a similar patch (and I replied agreeing with
it).

I believe in this particular case signed integers are safe, but if
"unsigned int" is the preferred style in the kernel I agree with the
change.


Thanks,
Luca

> 
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a5dc7c98b0a2..21991d668d35 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -473,10 +473,10 @@ struct sched_dl_entity {
>* conditions between the inactive timer handler and the
> wakeup
>* code.
>*/
> - int dl_throttled  : 1;
> - int dl_boosted: 1;
> - int dl_yielded: 1;
> - int dl_non_contending : 1;
> + unsigned intdl_throttled  : 1;
> + unsigned intdl_boosted: 1;
> + unsigned intdl_yielded: 1;
> + unsigned intdl_non_contending : 1;
>  
>   /*
>* Bandwidth enforcement timer. Each -deadline task has its



Re: [PATCH] sched/deadline: Don't use dubious signed bitfields

2017-10-13 Thread Luca Abeni
Hi,

On Fri, 13 Oct 2017 10:01:22 +0300
Dan Carpenter <dan.carpen...@oracle.com> wrote:

> It doesn't cause a run-time bug, but these bitfields should be unsigned.
> When it's signed ->dl_throttled is set to either 0 or -1, instead of
> 0 and 1 as expected.  The sched.h file is included into tons of places
> so Sparse generates a flood of warnings like this:
> 
> ./include/linux/sched.h:477:54: error: dubious one-bit signed bitfield
> 
> Signed-off-by: Dan Carpenter <dan.carpen...@oracle.com>

I did not notice any issue when testing, but if "unsigned int" is the
common practice for bitfields, I agree with the change.
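
For reference, here is a tiny user-space demo (my own sketch, not part
of the patch) of what sparse is complaining about; with gcc on x86_64
it prints -1:

#include <stdio.h>

struct flags {
        int dl_throttled : 1;   /* signed one-bit field: holds only 0 or -1 */
};

int main(void)
{
        struct flags f = { 0 };

        f.dl_throttled = 1;
        /* Non-zero checks still work, but a comparison against 1 would not */
        printf("%d\n", f.dl_throttled);
        return 0;
}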

Reviewed-by: Luca Abeni <luca.ab...@santannapisa.it>


Thanks,
Luca
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0f897dfc195e..105eaff8a5e7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -474,10 +474,10 @@ struct sched_dl_entity {
>* conditions between the inactive timer handler and the wakeup
>* code.
>*/
> - int dl_throttled  : 1;
> - int dl_boosted: 1;
> - int dl_yielded: 1;
> - int dl_non_contending : 1;
> + unsigned intdl_throttled  : 1;
> + unsigned intdl_boosted: 1;
> + unsigned intdl_yielded: 1;
> + unsigned intdl_non_contending : 1;
>  
>   /*
>* Bandwidth enforcement timer. Each -deadline task has its



Re: [PATCH 0/7] sched/deadline: fix cpusets bandwidth accounting

2017-10-13 Thread Luca Abeni
Hi Mathieu,

On Thu, 12 Oct 2017 10:57:09 -0600
Mathieu Poirier  wrote:
[...]
> >> Regardless of how we proceed (using existing CPUset list or new ones) we
> >> need to deal with DL tasks that span more than one root domain,  something
> >> that will typically happen after a CPUset operation.  For example, if we
> >> split the number of available CPUs on a system in two CPUsets and then turn
> >> off the 'sched_load_balance' flag on the parent CPUset, DL tasks in the
> >> parent CPUset will end up spanning two root domains.
> >>
> >> One way to deal with this is to prevent CPUset operations from happening
> >> when such condition is detected, as enacted in this set.  Although simple
> >> this approach feels brittle and akin to a "whack-a-mole" game.  A better
> >> and more reliable approach would be to teach the DL scheduler to deal with
> >> tasks that span multiple root domains, a serious and substantial
> >> undertaking.
> >>
> >> I am sending this as a starting point for discussion.  I would be grateful
> >> if you could take the time to comment on the approach and most importantly
> >> provide input on how to deal with the open issue underlined above.  
> >
> > Right, so teaching DEADLINE about arbitrary affinities is 'interesting'.
> >
> > Although the rules proposed by Tomasso; if found sufficient; would
> > greatly simplify things. Also the online semi-partition approach to SMP
> > could help with that.  
> 
> The "rules" proposed by Tomasso, are you referring to patches or the
> deadline/cgroup extension work that he presented at OSPM?

No, that is an unrelated thing... Tommaso previously proposed some
improvements to the admission control mechanism to take arbitrary
affinities into account.


I think Tommaso's proposal is similar to what I previously proposed in
this thread (to admit a SCHED_DEADLINE task with utilization
u = runtime / period and affinity to N runqueues, we can account u / N
to each one of the runqueues, and check if the sum of the utilizations
on each runqueue is < 1).
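
A minimal sketch of that admission rule (illustrative only; it uses
floating point for clarity, while the kernel would of course use the
fixed-point representation produced by to_ratio()):

#include <stdbool.h>
#include <stddef.h>

struct rq_util {
        double total;   /* sum of the shares charged to this runqueue */
};

/*
 * Charge u / N to each of the N runqueues the task is affine to, and
 * admit the task only if no runqueue exceeds a total utilization of 1.
 */
static bool dl_admit(struct rq_util *rqs[], size_t nr_rqs,
                     double runtime, double period)
{
        double share = (runtime / period) / nr_rqs;
        size_t i;

        for (i = 0; i < nr_rqs; i++)
                if (rqs[i]->total + share > 1.0)
                        return false;

        for (i = 0; i < nr_rqs; i++)
                rqs[i]->total += share;

        return true;
}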

As previously noticed by Peter, this might have some scalability issues
(a naive implementation would lock the root domain while iterating on
all the runqueues). A few days ago, I was discussing with Tommaso about a
possible solution based on not locking the root domain structure, and
eventually using a roll-back strategy if the status of the root domain
changes while we are updating it. I think in a previous email you
mentioned RCU, which might result in a similar solution.

Anyway, I am adding Tommaso in cc so that he can comment more.


> I'd also be
> interested to know more about this "online semi-partition approach to
> SMP" you mentioned.

It is basically an implementation (and extension to arbitrary
affinities) of this work:
http://drops.dagstuhl.de/opus/volltexte/2017/7165/


Luca

> Maybe that's a conversation we could have at the
> upcoming RT summit in Prague.
> 
> >
> > But yes, that's fairly massive surgery. For now I think we'll have to
> > live and accept the limitations. So failing the various cpuset
> > operations when they violate rules seems fine. Relaxing rules is always
> > easier than tightening them (later).  
> 
> Agreed.
> 
> >
> > One 'series' you might be interested in when respinning these is:
> >
> >   
> > https://lkml.kernel.org/r/20171011094833.pdp4torvotvjd...@hirez.programming.kicks-ass.net
> >
> > By doing synchronous domain rebuild we loose a bunch of funnies.  
> 
> Getting rid of the asynchronous nature of the hotplug path would be a
> delight - I'll start keeping track of that effort as well.
> 
> Thanks for the review,
> Mathieu



[tip:sched/core] sched/deadline: Use C bitfields for the state flags

2017-10-10 Thread tip-bot for luca abeni
Commit-ID:  799ba82de01e7543f6b2042e1a739f3a20255f23
Gitweb: https://git.kernel.org/tip/799ba82de01e7543f6b2042e1a739f3a20255f23
Author: luca abeni <luca.ab...@santannapisa.it>
AuthorDate: Thu, 7 Sep 2017 12:09:31 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Tue, 10 Oct 2017 11:45:26 +0200

sched/deadline: Use C bitfields for the state flags

Ask the compiler to use a single bit for storing true / false values,
instead of wasting the size of a whole int value.
Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected
Assembly (similar to the Assembly code generated when explicitly accessing
the bits with bitmasks, "&" and "|").

Signed-off-by: luca abeni <luca.ab...@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com>
Cc: Juri Lelli <juri.le...@arm.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poir...@linaro.org>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Link: 
http://lkml.kernel.org/r/1504778971-13573-5-git-send-email-luca.ab...@santannapisa.it
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 include/linux/sched.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33a01f4..0f897df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -474,10 +474,10 @@ struct sched_dl_entity {
 * conditions between the inactive timer handler and the wakeup
 * code.
 */
-   int dl_throttled;
-   int dl_boosted;
-   int dl_yielded;
-   int dl_non_contending;
+   int dl_throttled  : 1;
+   int dl_boosted: 1;
+   int dl_yielded: 1;
+   int dl_non_contending : 1;
 
/*
 * Bandwidth enforcement timer. Each -deadline task has its


[tip:sched/core] sched/deadline: Fix switching to -deadline

2017-10-10 Thread tip-bot for Luca Abeni
Commit-ID:  295d6d5e373607729bcc8182c25afe964655714f
Gitweb: https://git.kernel.org/tip/295d6d5e373607729bcc8182c25afe964655714f
Author: Luca Abeni <luca.ab...@santannapisa.it>
AuthorDate: Thu, 7 Sep 2017 12:09:29 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Tue, 10 Oct 2017 11:43:30 +0200

sched/deadline: Fix switching to -deadline

Fix a bug introduced in:

  72f9f3fdc928 ("sched/deadline: Remove dl_new from struct sched_dl_entity")

After that commit, when switching to -deadline if the scheduling
deadline of a task is in the past then switched_to_dl() calls
setup_new_entity() to properly initialize the scheduling deadline
and runtime.

The problem is that the task is enqueued _before_ having its parameters
initialized by setup_new_entity(), and this can cause problems.
For example, a task with its out-of-date deadline in the past will
potentially be enqueued as the highest priority one; however, its
adjusted deadline may not be the earliest one.

This patch fixes the problem by initializing the task's parameters before
enqueuing it.

Signed-off-by: luca abeni <luca.ab...@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com>
Cc: Juri Lelli <juri.le...@arm.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poir...@linaro.org>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Link: 
http://lkml.kernel.org/r/1504778971-13573-3-git-send-email-luca.ab...@santannapisa.it
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 kernel/sched/deadline.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0191ec7..9d45354 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1364,6 +1364,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+   } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline,
+rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+   setup_new_dl_entity(dl_se);
}
 
__enqueue_dl_entity(dl_se);
@@ -2255,13 +2259,6 @@ static void switched_to_dl(struct rq *rq, struct 
task_struct *p)
 
return;
}
-   /*
-* If p is boosted we already updated its params in
-* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
-* p's deadline being now already after rq_clock(rq).
-*/
-   if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-   setup_new_dl_entity(>dl);
 
if (rq->curr != p) {
 #ifdef CONFIG_SMP


[tip:sched/core] sched/headers: Remove duplicate prototype of __dl_clear_params()

2017-10-10 Thread tip-bot for luca abeni
Commit-ID:  e964d3501b64d6930aaa4dd18955a8cd086ccb92
Gitweb: https://git.kernel.org/tip/e964d3501b64d6930aaa4dd18955a8cd086ccb92
Author: luca abeni <luca.ab...@santannapisa.it>
AuthorDate: Thu, 7 Sep 2017 12:09:28 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Tue, 10 Oct 2017 11:43:30 +0200

sched/headers: Remove duplicate prototype of __dl_clear_params()

Signed-off-by: luca abeni <luca.ab...@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com>
Cc: Juri Lelli <juri.le...@arm.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poir...@linaro.org>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Link: 
http://lkml.kernel.org/r/1504778971-13573-2-git-send-email-luca.ab...@santannapisa.it
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 kernel/sched/sched.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e83d1b8..9d5aa18 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int 
policy,
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr 
*attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
-extern void __dl_clear_params(struct task_struct *p);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr 
*attr);
 extern int dl_task_can_attach(struct task_struct *p,
  const struct cpumask *cs_cpus_allowed);


[PATCH 4/4] sched/deadline: use C bitfields for the state flags

2017-09-07 Thread luca abeni
Ask the compiler to use a single bit for storing true / false values,
instead of wasting the size of a whole int value.
Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected
Assembly (similar to the Assembly code generated when explicitly accessing
the bits with bitmasks, "&" and "|").
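
As a rough, compile-it-yourself illustration of the saving (the structs below
are stand-ins, not sched_dl_entity itself; the sizes assume gcc on x86_64):

#include <stdio.h>

/* Four full ints: 16 bytes just for the flags. */
struct flags_wide {
        int dl_throttled;
        int dl_boosted;
        int dl_yielded;
        int dl_non_contending;
};

/* Four 1-bit fields share a single int-sized storage unit: 4 bytes. */
struct flags_packed {
        int dl_throttled      : 1;
        int dl_boosted        : 1;
        int dl_yielded        : 1;
        int dl_non_contending : 1;
};

int main(void)
{
        /* Typically prints "16 vs 4". */
        printf("%zu vs %zu\n", sizeof(struct flags_wide),
               sizeof(struct flags_packed));
        return 0;
}

With gcc a plain int bit-field is signed, so a 1-bit field holds only 0 or -1;
assigning 1 reads back as -1, which is still non-zero and therefore fine for
boolean flags.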

Signed-off-by: luca abeni <luca.ab...@santannapisa.it>
---
 include/linux/sched.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68b3833..e03cc69 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -472,10 +472,10 @@ struct sched_dl_entity {
 * conditions between the inactive timer handler and the wakeup
 * code.
 */
-   int dl_throttled;
-   int dl_boosted;
-   int dl_yielded;
-   int dl_non_contending;
+   int dl_throttled  : 1;
+   int dl_boosted: 1;
+   int dl_yielded: 1;
+   int dl_non_contending : 1;
 
/*
 * Bandwidth enforcement timer. Each -deadline task has its
-- 
2.7.4



[PATCH 2/4] sched/deadline: fix switching to -deadline

2017-09-07 Thread luca abeni
Fix a bug introduced in 72f9f3fdc928dc3ecd223e801b32d930b662b6ed:
after that commit, when switching to -deadline if the scheduling
deadline of a task is in the past then switched_to_dl() calls
setup_new_dl_entity() to properly initialize the scheduling deadline
and runtime.

The problem is that the task is enqueued _before_ having its parameters
initialized by setup_new_dl_entity(), and this can cause problems.
For example, a task with its out-of-date deadline in the past will
potentially be enqueued as the highest priority one; however, its
adjusted deadline may not be the earliest one.

This patch fixes the problem by initializing the task's parameters before
enqueuing it.
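
To see why the stale deadline wins, here is a small self-contained sketch;
the comparison below mirrors the scheduler's dl_time_before(), while the
clock values are invented for the example:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;
typedef long long s64;

/* Wrap-safe "a is earlier than b" test used for absolute deadlines. */
static bool dl_time_before(u64 a, u64 b)
{
        return (s64)(a - b) < 0;
}

int main(void)
{
        u64 now   = 10000000ULL;   /* pretend rq_clock() value          */
        u64 stale =  9000000ULL;   /* out-of-date deadline, in the past */
        u64 fresh = 10500000ULL;   /* deadline of a competing task      */

        /*
         * Enqueued before its parameters are refreshed, the switching
         * task still carries 'stale' and sorts ahead of 'fresh', even
         * though setup_new_dl_entity() would move its deadline past
         * 'now' (and possibly past 'fresh').
         */
        printf("stale earlier than fresh: %d\n", dl_time_before(stale, fresh));
        printf("stale earlier than now:   %d\n", dl_time_before(stale, now));
        return 0;
}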

Signed-off-by: luca abeni <luca.ab...@santannapisa.it>
Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com>
---
 kernel/sched/deadline.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d05bd94..d7f9e86 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1376,6 +1376,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+   } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline,
+rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+   setup_new_dl_entity(dl_se);
}
 
__enqueue_dl_entity(dl_se);
@@ -2267,13 +2271,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 
return;
}
-   /*
-* If p is boosted we already updated its params in
-* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
-* p's deadline being now already after rq_clock(rq).
-*/
-   if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-   setup_new_dl_entity(&p->dl);
 
if (rq->curr != p) {
 #ifdef CONFIG_SMP
-- 
2.7.4



[PATCH 3/4] sched/deadline: rename __dl_clear() to __dl_sub()

2017-09-07 Thread luca abeni
From: Peter Zijlstra <pet...@infradead.org>

__dl_sub() is more meaningful as a name, and is more consistent
with the naming of the dual function (__dl_add()).
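
For context only, a minimal sketch of the pair as it reads after the rename;
it is deliberately simplified (the real helpers also take a cpus argument and
call __dl_update() to propagate the change to the runqueues):

#include <stdio.h>

typedef unsigned long long u64;

struct dl_bw {
        u64 total_bw;   /* bandwidth currently reserved by -deadline tasks */
};

static void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
{
        dl_b->total_bw += tsk_bw;       /* reserve a task's bandwidth */
}

static void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw)
{
        dl_b->total_bw -= tsk_bw;       /* release it again: the exact dual */
}

int main(void)
{
        struct dl_bw b = { .total_bw = 0 };

        __dl_add(&b, 100);
        __dl_sub(&b, 100);
        printf("total_bw = %llu\n", b.total_bw);        /* back to 0 */
        return 0;
}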

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Signed-off-by: luca abeni <luca.ab...@santannapisa.it>
---
 kernel/sched/deadline.c | 10 +++++-----
 kernel/sched/sched.h    |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d7f9e86..bab11dd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -242,7 +242,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl);
raw_spin_lock(&dl_b->lock);
-   __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+   __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
}
@@ -1211,7 +1211,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
}
 
raw_spin_lock(&dl_b->lock);
-   __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+   __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
__dl_clear_params(p);
 
@@ -2182,7 +2182,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 * until we complete the update.
 */
raw_spin_lock(&src_dl_b->lock);
-   __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+   __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
 
@@ -2460,7 +2460,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
-   __dl_clear(dl_b, p->dl.dl_bw, cpus);
+   __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@ -2472,7 +2472,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 * But this would require to set the task's "inactive
 * timer" when the task is not inactive.
 */
-   __dl_clear(dl_b, p->dl.dl_bw, cpus);
+   __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
dl_change_utilization(p, new_bw);
err = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0b93e4b..061ec3b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -226,7 +226,7 @@ struct dl_bw {
 static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
 
 static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 {
dl_b->total_bw -= tsk_bw;
__dl_update(dl_b, (s32)tsk_bw / cpus);
-- 
2.7.4



[PATCH 1/4] sched/sched.h: remove duplicate prototype of __dl_clear_params()

2017-09-07 Thread luca abeni
Signed-off-by: luca abeni <luca.ab...@santannapisa.it>
---
 kernel/sched/sched.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1043c8b..0b93e4b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
-extern void __dl_clear_params(struct task_struct *p);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int dl_task_can_attach(struct task_struct *p,
  const struct cpumask *cs_cpus_allowed);
-- 
2.7.4


