Re: [RFC PATCH v2 4/6] sched/deadline: Introduce deadline servers
On Tue, 6 Oct 2020 11:35:23 +0200 Juri Lelli wrote: [...] > > > + if (dl_se->server_has_tasks(dl_se)) { > > > + enqueue_dl_entity(dl_se, dl_se, > > > ENQUEUE_REPLENISH); > > > + resched_curr(rq); > > > + __push_dl_task(rq, ); > > > + } else { > > > + replenish_dl_entity(dl_se, dl_se); > > > > I am wondering if here we need a "task_non_contending(dl_se)" after > > "replenish_dl_entity(dl_se, dl_se);"... > > > > Basically, what happened is that all the served tasks blocked while the > > server was throttled... So, now the server should be disabled (so, we > > replenish the dl entity but we do not insert it in runqueue). > > But when the server finished its budget and has been throttled, it has > > not been disabled (so, its utilization is still in running_bw). > > Hummm. For CFS, we call dl_server_stop() after the last CFS task blocks > and that calls dequeue_dl(SLEEP), which should be calling > task_non_contending(). That should be happening also while the server is > throttled and CFS tasks are running outside of it, no? You are right... I somehow lost this detail. > Guess I'm missing something. No, I was the one missing something :) Sorry about the noise. Thanks, Luca
Re: [RFC PATCH v2 4/6] sched/deadline: Introduce deadline servers
Hi, sorry for the late reply... Anyway, I am currently testing this patchset (and trying to use it for the "SCHED_DEADLINE-based cgroup scheduling" patchset). And during my tests I had a doubt: On Fri, 7 Aug 2020 11:50:49 +0200 Juri Lelli wrote: > From: Peter Zijlstra > > Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks > with higher priority (e.g., SCHED_FIFO) monopolize CPU(s). [...] > @@ -1024,10 +1062,34 @@ static enum hrtimer_restart dl_task_timer(struct > hrtimer *timer) > struct sched_dl_entity *dl_se = container_of(timer, >struct sched_dl_entity, >dl_timer); > - struct task_struct *p = dl_task_of(dl_se); > + struct task_struct *p; > struct rq_flags rf; > struct rq *rq; > > + if (dl_server(dl_se)) { > + struct rq *rq = rq_of_dl_se(dl_se); > + struct rq_flags rf; > + > + rq_lock(rq, ); > + if (dl_se->dl_throttled) { > + sched_clock_tick(); > + update_rq_clock(rq); > + > + if (dl_se->server_has_tasks(dl_se)) { > + enqueue_dl_entity(dl_se, dl_se, > ENQUEUE_REPLENISH); > + resched_curr(rq); > + __push_dl_task(rq, ); > + } else { > + replenish_dl_entity(dl_se, dl_se); I am wondering if here we need a "task_non_contending(dl_se)" after "replenish_dl_entity(dl_se, dl_se);"... Basically, what happened is that all the served tasks blocked while the server was throttled... So, now the server should be disabled (so, we replenish the dl entity but we do not insert it in runqueue). But when the server finished its budget and has been throttled, it has not been disabled (so, its utilization is still in running_bw). I think that if we do not call task_non_contending() the server's utilization is not correctly removed from running_bw (this does not happen for regular tasks, because task_non_contending() is invoked when they block, even if the dl entity is throttled... But for servers I think we have an issue). What do you think? Do you agree with this change? 
I have no easy way to reproduce the issue, but this change fixed some strange behaviours I was seeing when using this patch to schedule RT cgroups. Thanks, Luca > + } > + > + } > + rq_unlock(rq, ); > + > + return HRTIMER_NORESTART; > + } > + > + p = dl_task_of(dl_se); > rq = task_rq_lock(p, ); > > /* > @@ -1098,21 +1160,7 @@ static enum hrtimer_restart dl_task_timer(struct > hrtimer *timer) > else > resched_curr(rq); > > -#ifdef CONFIG_SMP > - /* > - * Queueing this task back might have overloaded rq, check if we need > - * to kick someone away. > - */ > - if (has_pushable_dl_tasks(rq)) { > - /* > - * Nothing relies on rq->lock after this, so its safe to drop > - * rq->lock. > - */ > - rq_unpin_lock(rq, ); > - push_dl_task(rq); > - rq_repin_lock(rq, ); > - } > -#endif > + __push_dl_task(rq, ); > > unlock: > task_rq_unlock(rq, p, ); > @@ -1154,12 +1202,11 @@ static void init_dl_task_timer(struct sched_dl_entity > *dl_se) > */ > static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) > { > - struct task_struct *p = dl_task_of(dl_se); > - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); > + struct rq *rq = rq_of_dl_se(dl_se); > > if (dl_time_before(dl_se->deadline, rq_clock(rq)) && > dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { > - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) > + if (unlikely(dl_se->dl_boosted || !start_dl_timer(dl_se))) > return; > dl_se->dl_throttled = 1; > if (dl_se->runtime > 0) > @@ -1216,29 +1263,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, > struct sched_dl_entity *dl_se) > return (delta * u_act) >> BW_SHIFT; > } > > -/* > - * Update the current task's runtime statistics (provided it is still > - * a -deadline task and has not been removed from the dl_rq). 
> - */ > -static void update_curr_dl(struct rq *rq) > +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, > s64 delta_exec) > { > - struct task_struct *curr = rq->curr; > - struct sched_dl_entity *dl_se = >dl; > - s64 delta_exec, scaled_delta_exec; > - int cpu = cpu_of(rq); > + s64 scaled_delta_exec; > > - if (!dl_task(curr) || !on_dl_rq(dl_se)) > - return; > - > - /* > - * Consumed budget is computed considering the time as > - * observed by schedulable
Re: [RFC PATCH v2 0/6] SCHED_DEADLINE server infrastructure
Hi Juri, thanks for sharing the v2 patchset! In the next days I'll have a look at it, and try some tests... In the meanwhile, I have some questions/comments after a first quick look. If I understand well, the patchset does not apply deadline servers to FIFO and RR tasks, right? How does this patchset interact with RT throttling? If I understand well, patch 6/6 does something like "use deadline servers for SCHED_OTHER only if FIFO/RR tasks risk to starve SCHED_OTHER tasks"... Right? I understand this is because you do not want to delay RT tasks if they are not starving other tasks. But then, maybe what you want is not deadline-based scheduling. Maybe a reservation-based scheduler based on fixed priorities is what you want? (with SCHED_DEADLINE, you could provide exact performance guarantees to SCHED_OTHER tasks, but I suspect patch 6/6 breaks these guarantees?) Thanks, Luca On Fri, 7 Aug 2020 11:50:45 +0200 Juri Lelli wrote: > Hi, > > This is RFC v2 of Peter's SCHED_DEADLINE server infrastructure > implementation [1]. > > SCHED_DEADLINE servers can help fixing starvation issues of low > priority tasks (e.g., SCHED_OTHER) when higher priority tasks > monopolize CPU cycles. Today we have RT Throttling; DEADLINE servers > should be able to replace and improve that. > > I rebased Peter's patches (adding changelogs where needed) on > tip/sched/core as of today and incorporated fixes to issues discussed > during RFC v1. Current set seems to even boot on real HW! :-) > > While playing with RFC v1 set (and discussing it further offline with > Daniel) it has emerged the need to slightly change the behavior. Patch > 6/6 is a (cumbersome?) attempt to show what's probably needed. > The problem with "original" implementation is that FIFO tasks might > suffer preemption from NORMAL even when spare CPU cycles are > available. 
In fact, fair deadline server is enqueued right away when > NORMAL tasks wake up and they are first scheduled by the server, thus > potentially preempting a well behaving FIFO task. This is of course > not ideal. So, in patch 6/6 I propose to use some kind of starvation > monitor/ watchdog that delays enqueuing of deadline servers to the > point when fair tasks might start to actually suffer from starvation > (just randomly picked HZ/2 for now). One problem I already see with > the current implementation is that it adds overhead to fair paths, so > I'm pretty sure there are better ways to implement the idea (e.g., > Daniel already suggested using a starvation monitor kthread sort of > thing). > > Receiving comments and suggestions is the sole purpose of this posting > at this stage. Hopefully we can further discuss the idea at Plumbers > in a few weeks. So, please don't focus too much into actual > implementation (which I plan to revise anyway after I'm back from pto > :), but try to see if this might actually fly. The feature seems to > be very much needed. > > Thanks! > > Juri > > 1 - > https://lore.kernel.org/lkml/20190726145409.947503...@infradead.org/ > > Juri Lelli (1): > sched/fair: Implement starvation monitor > > Peter Zijlstra (5): > sched: Unify runtime accounting across classes > sched/deadline: Collect sched_dl_entity initialization > sched/deadline: Move bandwidth accounting into > {en,de}queue_dl_entity sched/deadline: Introduce deadline servers > sched/fair: Add trivial fair server > > include/linux/sched.h| 28 ++- > kernel/sched/core.c | 23 +- > kernel/sched/deadline.c | 483 > --- kernel/sched/fair.c | > 136 ++- kernel/sched/rt.c| 17 +- > kernel/sched/sched.h | 50 +++- > kernel/sched/stop_task.c | 16 +- > 7 files changed, 522 insertions(+), 231 deletions(-) >
Re: [RFC PATCH v2 6/6] sched/fair: Implement starvation monitor
On Fri, 7 Aug 2020 15:43:53 +0200 Juri Lelli wrote: > On 07/08/20 15:28, luca abeni wrote: > > Hi Juri, > > > > On Fri, 7 Aug 2020 11:56:04 +0200 > > Juri Lelli wrote: > > > > > Starting deadline server for lower priority classes right away > > > when first task is enqueued might break guarantees > > > > Which guarantees are you thinking about, here? Response times of > > fixed priority tasks? > > Response time, but also wakeup latency (which, for better or worse, is > another important metric). > > > If fixed priority tasks are also scheduled through deadline servers, > > then you can provide response-time guarantees to them even when > > lower-priority/non-real-time tasks are scheduled through deadline > > servers. > > Right, but I fear we won't be able to keep current behavior for > wakeups: RT with highest prio always gets scheduled right away? Uhm... I think this depends on how the servers' parameters are designed: assigning "wrong" (or "bad") parameters to the server used to schedule RT tasks, this property is broken. (however, notice that even with the current patchset the highest priority task might be scheduled with some delay --- if the SCHED_OTHER deadline server is active because SCHED_OTHER tasks are being starved). Luca
Re: [RFC PATCH v2 6/6] sched/fair: Implement starvation monitor
Hi Peter, On Fri, 7 Aug 2020 12:46:18 +0200 pet...@infradead.org wrote: > On Fri, Aug 07, 2020 at 11:56:04AM +0200, Juri Lelli wrote: > > Starting deadline server for lower priority classes right away when > > first task is enqueued might break guarantees, as tasks belonging to > > intermediate priority classes could be uselessly preempted. E.g., a > > well behaving (non hog) FIFO task can be preempted by NORMAL tasks > > even if there are still CPU cycles available for NORMAL tasks to > > run, as they'll be running inside the fair deadline server for some > > period of time. > > > > To prevent this issue, implement a starvation monitor mechanism that > > starts the deadline server only if a (fair in this case) task hasn't > > been scheduled for some interval of time after it has been enqueued. > > Use pick/put functions to manage starvation monitor status. > > One thing I considerd was scheduling this as a least-laxity entity -- > such that it runs late, not early Are you thinking about scheduling both RT and non-RT tasks through deadline servers? If yes, then I think that using something like laxity-based scheduling for the SCHED_OTHER server can be a good idea (but then we need to understand how to combine deadline-based scheduling with laxity-based scheduling, etc...) Or are you thinking about keeping the SCHED_OTHER server throttled until its laxity is 0 (or until its laxity is lower than some small value)? In this second case, the approach would work even if RT tasks are not scheduled through a server (but I do not know which kind of performance guarantee we could provide). > -- and start the server when > rq->nr_running != rq->cfs.h_nr_running, IOW when there's !fair tasks > around. Yes, this could be a good optimization. Luca > > Not saying we should do it like that, but that's perhaps more > deterministic than this.
Re: [RFC PATCH v2 0/6] SCHED_DEADLINE server infrastructure
Hi Juri, On Fri, 7 Aug 2020 15:30:41 +0200 Juri Lelli wrote: [...] > > In the meanwhile, I have some questions/comments after a first quick > > look. > > > > If I understand well, the patchset does not apply deadline servers > > to FIFO and RR tasks, right? How does this patchset interact with RT > > throttling? > > Well, it's something like the dual of it, in that RT Throttling > directly throttles RT tasks to make spare CPU cycles available to > fair tasks while this patchset introduces deadline servers to > schedule fair tasks, thus still reserving CPU time for those (when > needed). Ah, OK... I was thinking about using deadline servers for both RT and non-RT tasks. And to use them not only to throttle, but also to provide some kind of performance guarantees (to all the scheduling classes). Think about what can be done when combining this mechanism with cgroups/containers :) [...] > > I understand this is because you do not > > want to delay RT tasks if they are not starving other tasks. But > > then, maybe what you want is not deadline-based scheduling. Maybe a > > reservation-based scheduler based on fixed priorities is what you > > want? (with SCHED_DEADLINE, you could provide exact performance > > guarantees to SCHED_OTHER tasks, but I suspect patch 6/6 breaks > > these guarantees?) > > I agree that we are not interested in exact guarantees in this case, > but why not using something that it's already there and would give us > the functionality we need (fix starvation for fair)? Ok, if performance guarantees to non-RT tasks are not the goal, then I agree. I was thinking that since the patchset provides a mechanism to schedule various classes of tasks through deadline servers, then using these servers to provide some kinds of guarantees could be interesting ;-) Thanks, Luca > It would also > work well in presence of "real" deadline tasks I think, in that you > could account for these fair servers while performing admission > control. > > Best, > > Juri >
Re: [RFC PATCH v2 6/6] sched/fair: Implement starvation monitor
Hi Juri, On Fri, 7 Aug 2020 11:56:04 +0200 Juri Lelli wrote: > Starting deadline server for lower priority classes right away when > first task is enqueued might break guarantees Which guarantees are you thinking about, here? Response times of fixed priority tasks? If fixed priority tasks are also scheduled through deadline servers, then you can provide response-time guarantees to them even when lower-priority/non-real-time tasks are scheduled through deadline servers. Thanks, Luca > as tasks belonging to > intermediate priority classes could be uselessly preempted. E.g., a > well behaving (non hog) FIFO task can be preempted by NORMAL tasks > even if there are still CPU cycles available for NORMAL tasks to run, > as they'll be running inside the fair deadline server for some period > of time. > > To prevent this issue, implement a starvation monitor mechanism that > starts the deadline server only if a (fair in this case) task hasn't > been scheduled for some interval of time after it has been enqueued. > Use pick/put functions to manage starvation monitor status. 
> > Signed-off-by: Juri Lelli > --- > kernel/sched/fair.c | 57 > ++-- kernel/sched/sched.h | > 4 2 files changed, 59 insertions(+), 2 deletions(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 6a97ee2a4e26d..5cdf76e508074 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -5494,6 +5494,53 @@ static int sched_idle_cpu(int cpu) > } > #endif > > + > +static void fair_server_watchdog(struct timer_list *list) > +{ > + struct rq *rq = container_of(list, struct rq, > fair_server_wd); > + struct rq_flags rf; > + > + rq_lock_irqsave(rq, ); > + rq->fair_server_wd_running = 0; > + > + if (!rq->cfs.h_nr_running) > + goto out; > + > + update_rq_clock(rq); > + dl_server_start(>fair_server); > + rq->fair_server_active = 1; > + resched_curr(rq); > + > +out: > + rq_unlock_irqrestore(rq, ); > +} > + > +static inline void fair_server_watchdog_start(struct rq *rq) > +{ > + if (rq->fair_server_wd_running || rq->fair_server_active) > + return; > + > + timer_setup(>fair_server_wd, fair_server_watchdog, 0); > + rq->fair_server_wd.expires = jiffies + > FAIR_SERVER_WATCHDOG_INTERVAL; > + add_timer_on(>fair_server_wd, cpu_of(rq)); > + rq->fair_server_active = 0; > + rq->fair_server_wd_running = 1; > +} > + > +static inline void fair_server_watchdog_stop(struct rq *rq, bool > stop_server) +{ > + if (!rq->fair_server_wd_running && !stop_server) > + return; > + > + del_timer(>fair_server_wd); > + rq->fair_server_wd_running = 0; > + > + if (stop_server && rq->fair_server_active) { > + dl_server_stop(>fair_server); > + rq->fair_server_active = 0; > + } > +} > + > /* > * The enqueue_task method is called before nr_running is > * increased. 
Here we update the fair scheduling stats and > @@ -5515,7 +5562,7 @@ enqueue_task_fair(struct rq *rq, struct > task_struct *p, int flags) util_est_enqueue(>cfs, p); > > if (!rq->cfs.h_nr_running) > - dl_server_start(>fair_server); > + fair_server_watchdog_start(rq); > > /* >* If in_iowait is set, the code below may not trigger any > cpufreq @@ -5670,7 +5717,7 @@ static void dequeue_task_fair(struct rq > *rq, struct task_struct *p, int flags) > dequeue_throttle: > if (!rq->cfs.h_nr_running) > - dl_server_stop(>fair_server); > + fair_server_watchdog_stop(rq, true); > > util_est_dequeue(>cfs, p, task_sleep); > hrtick_update(rq); > @@ -7123,6 +7170,7 @@ done: __maybe_unused; > hrtick_start_fair(rq, p); > > update_misfit_status(p, rq); > + fair_server_watchdog_stop(rq, false); > > return p; > > @@ -7178,6 +7226,8 @@ void fair_server_init(struct rq *rq) > dl_se->dl_period = 20 * TICK_NSEC; > > dl_server_init(dl_se, rq, fair_server_has_tasks, > fair_server_pick); + > + rq->fair_server_wd_running = 0; > } > > /* > @@ -7192,6 +7242,9 @@ static void put_prev_task_fair(struct rq *rq, > struct task_struct *prev) cfs_rq = cfs_rq_of(se); > put_prev_entity(cfs_rq, se); > } > + > + if (rq->cfs.h_nr_running) > + fair_server_watchdog_start(rq); > } > > /* > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index bf8c9c07705c9..1e1a5436be725 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -375,6 +375,7 @@ extern void dl_server_init(struct sched_dl_entity > *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, > dl_server_pick_f pick); > > +#define FAIR_SERVER_WATCHDOG_INTERVAL (HZ >> 1) > extern void fair_server_init(struct rq *); > > #ifdef CONFIG_CGROUP_SCHED > @@ -962,6 +963,9 @@ struct rq { > struct dl_rq
[tip: sched/core] sched/deadline: Implement fallback mechanism for !fit case
The following commit has been merged into the sched/core branch of tip: Commit-ID: 23e71d8ba42933bff12e453858fd68c073bc5258 Gitweb: https://git.kernel.org/tip/23e71d8ba42933bff12e453858fd68c073bc5258 Author:Luca Abeni AuthorDate:Wed, 20 May 2020 15:42:43 +02:00 Committer: Peter Zijlstra CommitterDate: Mon, 15 Jun 2020 14:10:05 +02:00 sched/deadline: Implement fallback mechanism for !fit case When a task has a runtime that cannot be served within the scheduling deadline by any of the idle CPU (later_mask) the task is doomed to miss its deadline. This can happen since the SCHED_DEADLINE admission control guarantees only bounded tardiness and not the hard respect of all deadlines. In this case try to select the idle CPU with the largest CPU capacity to minimize tardiness. Favor task_cpu(p) if it has max capacity of !fitting CPUs so that find_later_rq() can potentially still return it (most likely cache-hot) early. Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-6-dietmar.eggem...@arm.com --- kernel/sched/cpudeadline.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8630f2a..8cb06c8 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,19 +121,31 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { - int cpu; + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; if (!static_branch_unlikely(_asym_cpucapacity)) return 1; /* Ensure the capacity of the CPUs fits the task. 
*/ for_each_cpu(cpu, later_mask) { - if (!dl_task_fits_capacity(p, cpu)) + if (!dl_task_fits_capacity(p, cpu)) { cpumask_clear_cpu(cpu, later_mask); + + cap = capacity_orig_of(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } } - if (!cpumask_empty(later_mask)) - return 1; + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + + return 1; } else { int best_cpu = cpudl_maximum(cp);
[tip: sched/core] sched/deadline: Improve admission control for asymmetric CPU capacities
The following commit has been merged into the sched/core branch of tip: Commit-ID: 60ffd5edc5e4fa69622c125c54ef8e7d5d894af8 Gitweb: https://git.kernel.org/tip/60ffd5edc5e4fa69622c125c54ef8e7d5d894af8 Author:Luca Abeni AuthorDate:Wed, 20 May 2020 15:42:41 +02:00 Committer: Peter Zijlstra CommitterDate: Mon, 15 Jun 2020 14:10:05 +02:00 sched/deadline: Improve admission control for asymmetric CPU capacities The current SCHED_DEADLINE (DL) admission control ensures that sum of reserved CPU bandwidth < x * M where x = /proc/sys/kernel/sched_rt_{runtime,period}_us M = # CPUs in root domain. DL admission control works well for homogeneous systems where the capacity of all CPUs are equal (1024). I.e. bounded tardiness for DL and non-starvation of non-DL tasks is guaranteed. But on heterogeneous systems where capacity of CPUs are different it could fail by over-allocating CPU time on smaller capacity CPUs. On an Arm big.LITTLE/DynamIQ system DL tasks can easily starve other tasks making it unusable. Fix this by explicitly considering the CPU capacity in the DL admission test by replacing M with the root domain CPU capacity sum. Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-4-dietmar.eggem...@arm.com --- kernel/sched/deadline.c | 30 +- kernel/sched/sched.h| 6 +++--- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 01f474a..9ebd0a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2590,11 +2590,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2609,15 +2610,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(>dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2772,19 +2775,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { + unsigned long flags, cap; unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; - int cpus, ret; - unsigned long flags; + int ret; dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + cap = dl_bw_capacity(dest_cpu); + overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); if (overflow) { ret = -EBUSY; } else { @@ -2794,6 +2797,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). 
*/ + int cpus = dl_bw_cpus(dest_cpu); + __dl_add(dl_b, p->dl.dl_bw, cpus); ret = 0; } @@ -2826,16 +2831,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, bool dl_cpu_busy(unsigned int cpu) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(_b->lock, fl
[tip: sched/core] sched/deadline: Make DL capacity-aware
The following commit has been merged into the sched/core branch of tip: Commit-ID: b4118988fdcb4554ea6687dd8ff68bcab690b8ea Gitweb: https://git.kernel.org/tip/b4118988fdcb4554ea6687dd8ff68bcab690b8ea Author:Luca Abeni AuthorDate:Wed, 20 May 2020 15:42:42 +02:00 Committer: Peter Zijlstra CommitterDate: Mon, 15 Jun 2020 14:10:05 +02:00 sched/deadline: Make DL capacity-aware The current SCHED_DEADLINE (DL) scheduler uses a global EDF scheduling algorithm w/o considering CPU capacity or task utilization. This works well on homogeneous systems where DL tasks are guaranteed to have a bounded tardiness but presents issues on heterogeneous systems. A DL task can migrate to a CPU which does not have enough CPU capacity to correctly serve the task (e.g. a task w/ 70ms runtime and 100ms period on a CPU w/ 512 capacity). Add the DL fitness function dl_task_fits_capacity() for DL admission control on heterogeneous systems. A task fits onto a CPU if: CPU original capacity / 1024 >= task runtime / task deadline Use this function on heterogeneous systems to try to find a CPU which meets this criterion during task wakeup, push and offline migration. On homogeneous systems the original behavior of the DL admission control should be retained. 
Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-5-dietmar.eggem...@arm.com --- kernel/sched/cpudeadline.c | 14 +- kernel/sched/deadline.c| 18 ++ kernel/sched/sched.h | 15 +++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012..8630f2a 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,7 +121,19 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { - return 1; + int cpu; + + if (!static_branch_unlikely(_asym_cpucapacity)) + return 1; + + /* Ensure the capacity of the CPUs fits the task. */ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) + cpumask_clear_cpu(cpu, later_mask); + } + + if (!cpumask_empty(later_mask)) + return 1; } else { int best_cpu = cpudl_maximum(cp); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ebd0a9..84e84ba 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1643,6 +1643,7 @@ static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; if (sd_flag != SD_BALANCE_WAKE) @@ -1662,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || -!dl_entity_preempt(>dl, >dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || +!dl_entity_preempt(>dl, >dl)) && + p->nr_cpus_allowed > 1; + + /* +* Take the capacity of the CPU into account to +* ensure it fits the requirement of the task. 
+*/ + if (static_branch_unlikely(_asym_cpucapacity)) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91b250f..3368876 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -317,6 +317,21 @@ static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; } +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the CPU original capacity of the + * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the + * task and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; +} + extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void);
Re: [PATCH 4/5] sched/deadline: Cleanup on_dl_rq() handling
On Wed, 31 Jul 2019 18:32:47 +0100 Dietmar Eggemann wrote: [...] > static void dequeue_dl_entity(struct sched_dl_entity *dl_se) > { > +if (!on_dl_rq(dl_se)) > +return; > >>> > >>> Why allow double dequeue instead of WARN? > >> > >> As I was saying to Valentin, it can currently happen that a task > >> could have already been dequeued by update_curr_dl()->throttle > >> called by dequeue_task_dl() before calling __dequeue_task_dl(). Do > >> you think we should check for this condition before calling into > >> dequeue_dl_entity()? > > > > Yes, that's what ->dl_throttled is for, right? And !->dl_throttled > > && !on_dl_rq() is a BUG. > > OK, I will add the following snippet to the patch. > Although it's easy to provoke a situation in which DL tasks are > throttled, I haven't seen a throttling happening when the task is > being dequeued. This is a not-so-common situation, that can happen with periodic tasks (a-la rt-app) blocking on clock_nanosleep() (or similar) after executing for an amount of time comparable with the SCHED_DEADLINE runtime. It might happen that the task consumed a little bit more than the remaining runtime (but has not been throttled yet, because the accounting happens at every tick)... So, when dequeue_task_dl() invokes update_task_dl() the runtime becomes negative and the task is throttled. This happens infrequently, but if you try rt-app tasksets with multiple tasks and execution times near to the runtime you will see it happening, sooner or later. [...] > @@ -1592,6 +1591,10 @@ static void __dequeue_task_dl(struct rq *rq, > struct task_struct *p) static void dequeue_task_dl(struct rq *rq, > struct task_struct *p, int flags) { > update_curr_dl(rq); > + > + if (p->dl.dl_throttled) > + return; Sorry, I missed part of the previous discussion, so maybe I am missing something... But I suspect this "return" might be wrong (you risk to miss a call to task_non_contending(), coming later in this function). 
Maybe you could use if (!p->dl.dl_throttled) __dequeue_task_dl(rq, p) Or did I misunderstand something? Thanks, Luca
Re: [RFC][PATCH 00/13] SCHED_DEADLINE server infrastructure
Hi Peter, On Fri, 26 Jul 2019 16:54:09 +0200 Peter Zijlstra wrote: > Hi, > > So recently syzcaller ran into the big deadline/period issue (again), > and I figured I should at least propose a patch that puts limits on > that -- see Patch 1. > > During that discussion; SCHED_OTHER servers got mentioned (also > again), and I figured I should have a poke at that too. So I took > some inspiration from patches Alessio Balsini send a while back and > cobbled something together for that too. I think Patch 1 is a very good idea! The server patches look interesting (and they seem to be much simpler than our patchset :). I need to have a better look at them, but this seems to be very promising. Thanks, Luca > > Included are also a bunch of patches I did for core scheduling (2-8), > which I'm probably going to just merge as they're generic cleanups. > They're included here because they make pick_next_task() simpler and > thereby thinking about the nested pick_next_task() logic inherent in > servers was less of a head-ache. (I think it can work fine without > them, but its easier with them on) > > Anyway; after it all compiled it booted a kvm image all the way to > userspace on the first run, so clearly this code isn't to be trusted > at all. > > There's still lots of missing bits and pieces -- like changelogs and > the fair server isn't configurable or hooked into the bandwidth > accounting, but the foundation is there I think. > > Enjoy! >
Re: [PATCH 1/5] sched/deadline: Fix double accounting of rq/running bw in push_dl_task()
Hi, On Fri, 26 Jul 2019 09:27:52 +0100 Dietmar Eggemann wrote: [...] > @@ -2121,17 +2121,13 @@ static int push_dl_task(struct rq *rq) > } > > deactivate_task(rq, next_task, 0); > - sub_running_bw(_task->dl, >dl); > - sub_rq_bw(_task->dl, >dl); > set_task_cpu(next_task, later_rq->cpu); > - add_rq_bw(_task->dl, _rq->dl); > > /* >* Update the later_rq clock here, because the clock is used >* by the cpufreq_update_util() inside __add_running_bw(). >*/ > update_rq_clock(later_rq); > - add_running_bw(_task->dl, _rq->dl); Looking at the code again and thinking a little bit more about this issue, I suspect a similar change is needed in pull_dl_task() too, no? Luca
Re: [PATCH 5/5] sched/deadline: Use return value of SCHED_WARN_ON() in bw accounting
Hi Dietmar, On Fri, 26 Jul 2019 09:27:56 +0100 Dietmar Eggemann wrote: > To make the decision whether to set rq or running bw to 0 in underflow > case use the return value of SCHED_WARN_ON() rather than an extra if > condition. I think I tried this at some point, but if I remember well this solution does not work correctly when SCHED_DEBUG is not enabled. Luca > > Signed-off-by: Dietmar Eggemann > --- > kernel/sched/deadline.c | 6 ++ > 1 file changed, 2 insertions(+), 4 deletions(-) > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index a9cb52ceb761..66c594b5507e 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -95,8 +95,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq > *dl_rq) > lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); > dl_rq->running_bw -= dl_bw; > - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ > - if (dl_rq->running_bw > old) > + if (SCHED_WARN_ON(dl_rq->running_bw > old)) /* underflow */ > dl_rq->running_bw = 0; > /* kick cpufreq (see the comment in kernel/sched/sched.h). */ > cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); > @@ -119,8 +118,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) > > lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); > dl_rq->this_bw -= dl_bw; > - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ > - if (dl_rq->this_bw > old) > + if (SCHED_WARN_ON(dl_rq->this_bw > old)) /* underflow */ > dl_rq->this_bw = 0; > SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); > }
Re: [PATCH 1/5] sched/deadline: Fix double accounting of rq/running bw in push_dl_task()
Hi Dietmar, On Fri, 26 Jul 2019 09:27:52 +0100 Dietmar Eggemann wrote: > push_dl_task() always calls deactivate_task() with flags=0 which sets > p->on_rq=TASK_ON_RQ_MIGRATING. Uhm... This is a recent change in the deactivate_task() behaviour, right? Because I tested SCHED_DEADLINE a lot, but I've never seen this issue :) Anyway, looking at the current code the change looks OK. Thanks for fixing this issue! Luca > push_dl_task()->deactivate_task()->dequeue_task()->dequeue_task_dl() > calls sub_[running/rq]_bw() since p->on_rq=TASK_ON_RQ_MIGRATING. > So sub_[running/rq]_bw() in push_dl_task() is double-accounting for > that task. > > The same is true for add_[rq/running]_bw() and activate_task() on the > destination (later) CPU. > push_dl_task()->activate_task()->enqueue_task()->enqueue_task_dl() > calls add_[rq/running]_bw() again since p->on_rq is still set to > TASK_ON_RQ_MIGRATING. > So the add_[rq/running]_bw() in enqueue_task_dl() is double-accounting > for that task. > > Fix this by removing the rq/running bw accounting in push_dl_task(). > > Trace (CONFIG_SCHED_DEBUG=y) before the fix on a 6 CPUs system with 6 > DL (12000, 10, 10) tasks showing the issue: > > [ 48.147868] dl_rq->running_bw > old > [ 48.147886] WARNING: CPU: 1 PID: 0 at kernel/sched/deadline.c:98 > ... > [ 48.274832] inactive_task_timer+0x468/0x4e8 > [ 48.279057] __hrtimer_run_queues+0x10c/0x3b8 > [ 48.283364] hrtimer_interrupt+0xd4/0x250 > [ 48.287330] tick_handle_oneshot_broadcast+0x198/0x1d0 > ... > [ 48.360057] dl_rq->running_bw > dl_rq->this_bw > [ 48.360065] WARNING: CPU: 1 PID: 0 at kernel/sched/deadline.c:86 > ... > [ 48.488294] task_contending+0x1a0/0x208 > [ 48.492172] enqueue_task_dl+0x3b8/0x970 > [ 48.496050] activate_task+0x70/0xd0 > [ 48.499584] ttwu_do_activate+0x50/0x78 > [ 48.503375] try_to_wake_up+0x270/0x7a0 > [ 48.507167] wake_up_process+0x14/0x20 > [ 48.510873] hrtimer_wakeup+0x1c/0x30 > ... 
> [ 50.062867] dl_rq->this_bw > old > [ 50.062885] WARNING: CPU: 1 PID: 2048 at > kernel/sched/deadline.c:122 ... > [ 50.190520] dequeue_task_dl+0x1e4/0x1f8 > [ 50.194400] __sched_setscheduler+0x1d0/0x860 > [ 50.198707] _sched_setscheduler+0x74/0x98 > [ 50.202757] do_sched_setscheduler+0xa8/0x110 > [ 50.207065] __arm64_sys_sched_setscheduler+0x1c/0x30 > > Signed-off-by: Dietmar Eggemann > --- > kernel/sched/deadline.c | 4 > 1 file changed, 4 deletions(-) > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index de2bd006fe93..d1aeada374e1 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -2121,17 +2121,13 @@ static int push_dl_task(struct rq *rq) > } > > deactivate_task(rq, next_task, 0); > - sub_running_bw(_task->dl, >dl); > - sub_rq_bw(_task->dl, >dl); > set_task_cpu(next_task, later_rq->cpu); > - add_rq_bw(_task->dl, _rq->dl); > > /* >* Update the later_rq clock here, because the clock is used >* by the cpufreq_update_util() inside __add_running_bw(). >*/ > update_rq_clock(later_rq); > - add_running_bw(_task->dl, _rq->dl); > activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); > ret = 1; >
Re: [RFC PATCH 3/6] sched/dl: Try better placement even for deadline tasks that do not block
Hi Peter, On Mon, 8 Jul 2019 15:55:36 +0200 Peter Zijlstra wrote: > On Mon, May 06, 2019 at 06:48:33AM +0200, Luca Abeni wrote: > > @@ -1223,8 +1250,17 @@ static void update_curr_dl(struct rq *rq) > > dl_se->dl_overrun = 1; > > > > __dequeue_task_dl(rq, curr, 0); > > - if (unlikely(dl_se->dl_boosted > > || !start_dl_timer(curr))) > > + if (unlikely(dl_se->dl_boosted > > || !start_dl_timer(curr))) { enqueue_task_dl(rq, curr, > > ENQUEUE_REPLENISH); +#ifdef CONFIG_SMP > > + } else if (dl_se->dl_adjust) { > > + if (rq->migrating_task == NULL) { > > + queue_balance_callback(rq, > > _cpu(dl_migrate_head, rq->cpu), migrate_dl_task); > > I'm not entirely sure about this one. > > That is, we only do those callbacks from: > > schedule_tail() > __schedule() > rt_mutex_setprio() > __sched_setscheduler() > > and the above looks like it can happen outside of those. Sorry, I did not know the constraints or requirements for using queue_balance_callback()... I used it because I wanted to trigger a migration from update_curr_dl(), but invoking double_lock_balance() from this function obviously resulted in a warning. So, I probably misunderstood the purpose of the balance callback API, and I misused it. What would have been the "right way" to trigger a migration for a task when it is throttled? > > The pattern in those sites is: > > rq_lock(); > ... do crap that leads to queue_balance_callback() > rq_unlock() > if (rq->balance_callback) { > raw_spin_lock_irqsave(rq->lock, flags); > ... do callbacks > raw_spin_unlock_irqrestore(rq->lock, flags); > } > > So I suppose can catch abuse of this API by doing something like the > below; can you validate? Sorry; right now I cannot run tests on big.LITTLE machines... Maybe Dietmar (added in cc), who is working on mainlining this patchset, can test? 
Thanks, Luca > > --- > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index aaca0e743776..89e615f1eae6 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -1134,6 +1134,14 @@ static inline void rq_pin_lock(struct rq *rq, > struct rq_flags *rf) rf->cookie = lockdep_pin_lock(>lock); > > #ifdef CONFIG_SCHED_DEBUG > +#ifdef CONFIG_SMP > + /* > + * There should not be pending callbacks at the start of > rq_lock(); > + * all sites that handle them flush them at the end. > + */ > + WARN_ON_ONCE(rq->balance_callback); > +#endif > + > rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); > rf->clock_update_flags = 0; > #endif
Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations
Hi Dietmar, On Thu, 4 Jul 2019 14:05:22 +0200 Dietmar Eggemann wrote: > On 5/6/19 6:48 AM, Luca Abeni wrote: > > [...] > > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > > index 5b981eeeb944..3436f3d8fa8f 100644 > > --- a/kernel/sched/deadline.c > > +++ b/kernel/sched/deadline.c > > @@ -1584,6 +1584,9 @@ select_task_rq_dl(struct task_struct *p, int > > cpu, int sd_flag, int flags) if (sd_flag != SD_BALANCE_WAKE) > > goto out; > > > > + if (dl_entity_is_special(>dl)) > > + goto out; > > I wonder if this is really required. The if condition > > 1591 if (unlikely(dl_task(curr)) && > 1592 (curr->nr_cpus_allowed < 2 || > 1593 !dl_entity_preempt(>dl, >dl)) && > 1594 (p->nr_cpus_allowed > 1)) { > > further below uses '!dl_entity_preempt(>dl, >dl))' which > returns 'dl_entity_is_special(a) || ...' Uhm... I do not remember the details; I remember that the check fixed something during the development of the patchset, but I did not check if it was still needed when I forward-ported the patches... So, maybe it worked around some bugs in previous versions of the kernel, but is not needed now. Luca
Re: [RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup
Hi Juri, On Wed, 8 May 2019 15:10:18 +0200 Juri Lelli wrote: > On 08/05/19 14:47, luca abeni wrote: > > [...] > > > Notice that all this logic is used only to select one of the idle > > cores (instead of picking the first idle core, we select the less > > powerful core on which the task "fits"). > > > > So, running_bw does not provide any useful information, in this > > case; maybe this_bw can be more useful. > > Ah, indeed. > > However, what happens when cores are all busy? Can load balancing > decisions based on deadline values only break capacity awareness? (I > understand this is an RFC, so a "yes, something we need to look at" is > OK :-) I have no definitive answer about this :) If we do not want to break EDF and its properties, migrations have to be performed so that the "global EDF invariant" (schedule the m tasks with earliest deadlines) is not broken (and capacity-based migrations for non-idle cores risk to break this invariant). In theory, we should do something like "schedule the earliest deadline task on the fastest core", but this would require too many migrations (and would require to migrate a task while it is executing, etc...) I am convinced that doing the capacity-aware migrations when we have idle cores helps to reduce migrations (avoiding "useless migrations") when there are no idle cores, but I have no proof about this (just some simple examples); this is why I say that I have no definitive answer... Luca
Re: [RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup
Hi Juri, On Wed, 8 May 2019 14:05:26 +0200 Juri Lelli wrote: [...] > > > > + if ((rel_deadline < 0) || (rel_deadline * > > > > dl_se->dl_runtime < dl_se->dl_deadline * rem_runtime)) { > > > > + rel_deadline = dl_se->dl_deadline; > > > > + rem_runtime = dl_se->dl_runtime; > > > > + } > > > > > > So, are you basically checking if current remaining bw can be > > > consumed safely? > > > > I check if the current runtime (rescaled based on the capacity) is > > smaller than the time to the current scheduling deadline > > (basically, if it can be consumed in time). > > > > However, if > > q / (d - t) > Q / P > > (where "q" is the current runtime, "d" is the scheduling deadline, > > "Q" is the maximum runtime, and "P" is the CBS period), then a new > > scheduling deadline will be generated (later), and the runtime will > > be reset to Q... So, I need to use the maximum budget and CBS > > period for checking if the task fits in the core. > > OK. I'd add a comment about it. Ok; I'll add a comment in the next version of the patchset. [...] > > > I'm not actually sure if looking at dynamic values is what we > > > need to do at this stage. By considering static values we fix > > > admission control (and scheduling). Aren't dynamic values more to > > > do with energy tradeoffs (and so to be introduced when starting > > > to look at the energy model)? > > > > Using the current runtime and scheduling deadline might allow to > > migrate a task to SMALL cores (if its remaining runtime is small > > enough), even if the rescaled Q is larger than P. > > So, in theory it might allow to reduce the load on big cores. > > > > If we decide that this is overkilling, I can just drop the patch. > > So, my first impression was that we shouldn't be too clever until we > start using info from the energy model (using which one should be able > to understand if, for example, reducing load on big cores is a winning > power/perf decision). Ok. 
> However, I was also wondering if we should actually compare dynamic > parameters with {running,this}_bw (per-rq) the moment we search for > potential migration candidates (so that we are not overloading rqs). Notice that all this logic is used only to select one of the idle cores (instead of picking the first idle core, we select the less powerful core on which the task "fits"). So, running_bw does not provide any useful information, in this case; maybe this_bw can be more useful. Luca
Re: [RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup
On Wed, 8 May 2019 11:08:55 +0200 Juri Lelli wrote: > On 06/05/19 06:48, Luca Abeni wrote: > > From: luca abeni > > > > Instead of considering the "static CPU bandwidth" allocated to > > a SCHED_DEADLINE task (ratio between its maximum runtime and > > reservation period), try to use the remaining runtime and time > > to scheduling deadline. > > > > Signed-off-by: luca abeni > > --- > > kernel/sched/cpudeadline.c | 9 +++-- > > 1 file changed, 7 insertions(+), 2 deletions(-) > > > > diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c > > index d21f7905b9c1..111dd9ac837b 100644 > > --- a/kernel/sched/cpudeadline.c > > +++ b/kernel/sched/cpudeadline.c > > @@ -114,8 +114,13 @@ static inline int dl_task_fit(const struct > > sched_dl_entity *dl_se, int cpu, u64 *c) > > { > > u64 cap = (arch_scale_cpu_capacity(NULL, cpu) * > > arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT; > > - s64 rel_deadline = dl_se->dl_deadline; > > - u64 rem_runtime = dl_se->dl_runtime; > > + s64 rel_deadline = dl_se->deadline - > > sched_clock_cpu(smp_processor_id()); > > + u64 rem_runtime = dl_se->runtime; > > + > > + if ((rel_deadline < 0) || (rel_deadline * > > dl_se->dl_runtime < dl_se->dl_deadline * rem_runtime)) { > > + rel_deadline = dl_se->dl_deadline; > > + rem_runtime = dl_se->dl_runtime; > > + } > > So, are you basically checking if current remaining bw can be consumed > safely? I check if the current runtime (rescaled based on the capacity) is smaller than the time to the current scheduling deadline (basically, if it can be consumed in time). However, if q / (d - t) > Q / P (where "q" is the current runtime, "d" is the scheduling deadline, "Q" is the maximum runtime, and "P" is the CBS period), then a new scheduling deadline will be generated (later), and the runtime will be reset to Q... So, I need to use the maximum budget and CBS period for checking if the task fits in the core. 
> > I'm not actually sure if looking at dynamic values is what we need to > do at this stage. By considering static values we fix admission > control (and scheduling). Aren't dynamic values more to do with > energy tradeoffs (and so to be introduced when starting to look at > the energy model)? Using the current runtime and scheduling deadline might allow to migrate a task to SMALL cores (if its remaining runtime is small enough), even if the rescaled Q is larger than P. So, in theory it might allow to reduce the load on big cores. If we decide that this is overkilling, I can just drop the patch. Luca > Another pair of hands maybe is to look at the dynamic spare bw of CPUs > (to check that we don't overload CPUs). > > Thanks, > > - Juri
Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations
On Wed, 8 May 2019 10:04:36 +0200 Juri Lelli wrote: > Hi Luca, > > On 06/05/19 06:48, Luca Abeni wrote: > > From: luca abeni > > > > Currently, the SCHED_DEADLINE scheduler uses a global EDF scheduling > > algorithm, migrating tasks to CPU cores without considering the core > > capacity and the task utilization. This works well on homogeneous > > systems (SCHED_DEADLINE tasks are guaranteed to have a bounded > > tardiness), but presents some issues on heterogeneous systems. For > > example, a SCHED_DEADLINE task might be migrated on a core that has > > not enough processing capacity to correctly serve the task (think > > about a task with runtime 70ms and period 100ms migrated to a core > > with processing capacity 0.5) > > > > This commit is a first step to address the issue: When a task wakes > > up or migrates away from a CPU core, the scheduler tries to find an > > idle core having enough processing capacity to serve the task. > > > > Signed-off-by: luca abeni > > --- > > kernel/sched/cpudeadline.c | 31 +-- > > kernel/sched/deadline.c| 8 ++-- > > kernel/sched/sched.h | 7 ++- > > 3 files changed, 41 insertions(+), 5 deletions(-) > > > > diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c > > index 50316455ea66..d21f7905b9c1 100644 > > --- a/kernel/sched/cpudeadline.c > > +++ b/kernel/sched/cpudeadline.c > > @@ -110,6 +110,22 @@ static inline int cpudl_maximum(struct cpudl > > *cp) return cp->elements[0].cpu; > > } > > > > +static inline int dl_task_fit(const struct sched_dl_entity *dl_se, > > + int cpu, u64 *c) > > +{ > > + u64 cap = (arch_scale_cpu_capacity(NULL, cpu) * > > arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT; > > + s64 rel_deadline = dl_se->dl_deadline; > > + u64 rem_runtime = dl_se->dl_runtime; > > This is not the dynamic remaining one, is it? 
Right; I preferred to split this in two patches so that if we decide to use only the static task parameters (dl_deadline and dl_runtime) I can simply drop a patch ;-) Luca > > I see however 4/6.. lemme better look at that. > > Best, > > - Juri
Re: [RFC PATCH 3/6] sched/dl: Try better placement even for deadline tasks that do not block
Hi Juri, On Wed, 8 May 2019 10:01:16 +0200 Juri Lelli wrote: > Hi Luca, > > On 06/05/19 06:48, Luca Abeni wrote: > > From: luca abeni > > > > Currently, the scheduler tries to find a proper placement for > > SCHED_DEADLINE tasks when they are pushed out of a core or when > > they wake up. Hence, if there is a single SCHED_DEADLINE task > > that never blocks and wakes up, such a task is never migrated to > > an appropriate CPU core, but continues to execute on its original > > core. > > > > This commit addresses the issue by trying to migrate a > > SCHED_DEADLINE task (searching for an appropriate CPU core) the > > first time it is throttled. > > Why we failed to put the task on a CPU with enough (max) capacity > right after it passed admission control? The very first time the task > was scheduled I mean. I think the currently executing task cannot be pushed out of a CPU/core, right? So, if a task switches from SCHED_OTHER to SCHED_DEADLINE while it is executing on a fast core, the only way to migrate it would be to preempt it (by using the stop_sched_class, I think), no? (the typical situation here is a "cpu hog" task that switches from SCHED_OTHER to SCHED_DEADLINE, and it is the only SCHED_DEADLINE task... The task never blocks, so push/pull functions are never invoked) Or am I missing something? Thanks, Luca
Re: [RFC PATCH 6/6] sched/dl: Try not to select a too fast core
Hi Quentin, On Tue, 7 May 2019 16:57:34 +0100 Quentin Perret wrote: > On Monday 06 May 2019 at 06:48:36 (+0200), Luca Abeni wrote: > > From: luca abeni > > > > When a task can fit on multiple CPU cores, try to select the slowest > > core that is able to properly serve the task. This avoids useless > > future migrations, leaving the "fast cores" idle for more > > heavyweight tasks. > > But only if the _current_ capacity of big CPUs (at the current freq) > is higher than the current capacity of the littles, is that right ? > So we don't really have a guarantee to pack small tasks on little > cores ... Yes, the capacity is estimated at the current frequency, so this is a potential problem. > What is the rationale for looking at the current freq in > dl_task_fit() ? Mainly two reasons: the first one is to try to reduce frequency switches (I did not perform measurements on the hikey960, I remember that on other CPUs a frequency switch can take a considerable amount of time). Then, I wanted to have a mechanism that can work with all the possible cpufreq governors... So, I did not assume that the frequency can change (for example, I remember that without considering the current frequency I had issues when using the "userspace" governor). Maybe I just do not know this kernel subsystem well enough, but I did not find any way to know the maximum frequency that the current governor can set (I mean, I found a "maximum frequency" field that tells me the maximum frequency that the cpufreq driver can set, but I do not know if the governor will be able to set it --- again, consider the "userspace" governor). If there is a way to know this value, then I can use it for checking if a task can fit in a core. Thanks, Luca > Energy reasons ? If so, I'd argue you should look at > the energy model to break the tie between CPU candidates ... ;) > > And in the mean time, you could just look at arch_scale_cpu_capacity() > to check if a task fits ? 
> > > Signed-off-by: luca abeni > > --- > > kernel/sched/cpudeadline.c | 17 - > > 1 file changed, 12 insertions(+), 5 deletions(-) > > > > diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c > > index 2a4ac7b529b7..897ed71af515 100644 > > --- a/kernel/sched/cpudeadline.c > > +++ b/kernel/sched/cpudeadline.c > > @@ -143,17 +143,24 @@ int cpudl_find(struct cpudl *cp, struct > > task_struct *p, struct cpumask *later_mask) > > { > > const struct sched_dl_entity *dl_se = >dl; > > + struct cpumask tmp_mask; > > Hmm, these can get pretty big, so not sure about having one on the > stack ... > > > > > if (later_mask && > > - cpumask_and(later_mask, cp->free_cpus, > > >cpus_allowed)) { > > + cpumask_and(_mask, cp->free_cpus, > > >cpus_allowed)) { int cpu, max_cpu = -1; > > - u64 max_cap = 0; > > + u64 max_cap = 0, min_cap = SCHED_CAPACITY_SCALE * > > SCHED_CAPACITY_SCALE; > > - for_each_cpu(cpu, later_mask) { > > + cpumask_clear(later_mask); > > + for_each_cpu(cpu, _mask) { > > u64 cap; > > > > - if (!dl_task_fit(>dl, cpu, )) > > - cpumask_clear_cpu(cpu, later_mask); > > + if (dl_task_fit(>dl, cpu, ) && (cap > > <= min_cap)) { > > + if (cap < min_cap) { > > + min_cap = cap; > > + cpumask_clear(later_mask); > > + } > > + cpumask_set_cpu(cpu, later_mask); > > + } > > > > if (cap > max_cap) { > > max_cap = cap; > > -- > > 2.20.1 > > > > Thanks, > Quentin
Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations
Hi, On Tue, 7 May 2019 15:10:50 +0100 Quentin Perret wrote: > On Monday 06 May 2019 at 06:48:32 (+0200), Luca Abeni wrote: > > static inline unsigned long cpu_bw_dl(struct rq *rq) > > { > > - return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> > > BW_SHIFT; > > + unsigned long res; > > + > > + res = (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> > > BW_SHIFT; + > > + return (res << SCHED_CAPACITY_SHIFT) / > > + arch_scale_cpu_capacity(NULL, rq->cpu); > > The only user of cpu_bw_dl() is schedutil right ? If yes, we probably > don't want to scale things here -- schedutil already does this I > believe. I think I added this modification because I arrived to the conclusion that schedutil does not perform this rescaling (at least, I think it did not perform it when I wrote the patch :) BTW, while re-reading the patch I do not understand why this change was added in this specific patch... I suspect it should have gone in a separate patch. Luca > > Thanks, > Quentin > > > } > > > > static inline unsigned long cpu_util_dl(struct rq *rq) > > -- > > 2.20.1 > >
Re: [RFC PATCH 1/6] sched/dl: Improve deadline admission control for asymmetric CPU capacities
On Tue, 7 May 2019 15:31:27 +0100 Quentin Perret wrote: > On Tuesday 07 May 2019 at 16:25:23 (+0200), luca abeni wrote: > > On Tue, 7 May 2019 14:48:52 +0100 > > Quentin Perret wrote: > > > > > Hi Luca, > > > > > > On Monday 06 May 2019 at 06:48:31 (+0200), Luca Abeni wrote: > > > > diff --git a/drivers/base/arch_topology.c > > > > b/drivers/base/arch_topology.c index edfcf8d982e4..646d6d349d53 > > > > 100644 --- a/drivers/base/arch_topology.c > > > > +++ b/drivers/base/arch_topology.c > > > > @@ -36,6 +36,7 @@ DEFINE_PER_CPU(unsigned long, cpu_scale) = > > > > SCHED_CAPACITY_SCALE; > > > > void topology_set_cpu_scale(unsigned int cpu, unsigned long > > > > capacity) { > > > > + topology_update_cpu_capacity(cpu, per_cpu(cpu_scale, > > > > cpu), capacity); > > > > > > Why is that one needed ? Don't you end up re-building the sched > > > domains after this anyways ? > > > > If I remember correctly, this function was called at boot time when > > the capacities are assigned to the CPU cores. > > > > I do not remember if the sched domain was re-built after this call, > > but I admit I do not know this part of the kernel very well... > > Right and things moved recently in this area, see bb1fbdd3c3fd > ("sched/topology, drivers/base/arch_topology: Rebuild the sched_domain > hierarchy when capacities change") Ah, thanks! I missed this change when rebasing the patchset. I guess this part of the patch has to be updated (and probably became useless?), then. Thanks, Luca > > > This achieved the effect of correctly setting up the "rd_capacity" > > field, but I do not know if there is a better/simpler way to achieve > > the same result :) > > OK, that's really an implementation detail, so no need to worry too > much about it at the RFC stage I suppose :-) > > Thanks, > Quentin
Re: [RFC PATCH 1/6] sched/dl: Improve deadline admission control for asymmetric CPU capacities
On Tue, 7 May 2019 14:48:52 +0100 Quentin Perret wrote: > Hi Luca, > > On Monday 06 May 2019 at 06:48:31 (+0200), Luca Abeni wrote: > > diff --git a/drivers/base/arch_topology.c > > b/drivers/base/arch_topology.c index edfcf8d982e4..646d6d349d53 > > 100644 --- a/drivers/base/arch_topology.c > > +++ b/drivers/base/arch_topology.c > > @@ -36,6 +36,7 @@ DEFINE_PER_CPU(unsigned long, cpu_scale) = > > SCHED_CAPACITY_SCALE; > > void topology_set_cpu_scale(unsigned int cpu, unsigned long > > capacity) { > > + topology_update_cpu_capacity(cpu, per_cpu(cpu_scale, cpu), > > capacity); > > Why is that one needed ? Don't you end up re-building the sched > domains after this anyways ? If I remember correctly, this function was called at boot time when the capacities are assigned to the CPU cores. I do not remember if the sched domain was re-built after this call, but I admit I do not know this part of the kernel very well... This achieved the effect of correctly setting up the "rd_capacity" field, but I do not know if there is a better/simpler way to achieve the same result :) Thanks, Luca > > > per_cpu(cpu_scale, cpu) = capacity; > > } > > Thanks, > Quentin
Re: [RFC PATCH 2/6] sched/dl: Capacity-aware migrations
Hi Quentin, On Tue, 7 May 2019 14:35:28 +0100 Quentin Perret wrote: > Hi Luca, > > On Monday 06 May 2019 at 06:48:32 (+0200), Luca Abeni wrote: > > +static inline int dl_task_fit(const struct sched_dl_entity *dl_se, > > + int cpu, u64 *c) > > +{ > > + u64 cap = (arch_scale_cpu_capacity(NULL, cpu) * > > arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT; > > I'm a little bit confused by this use of arch_scale_freq_capacity() > here. IIUC this means you would say a big DL task doesn't fit on a big > CPU just because it happens to be running at a low frequency when this > function is called. Is this what we want ? The idea of this approach was to avoid frequency switches when possible; so, I wanted to check if the task fits on a CPU core at its current operating frequency. > If the frequency is low, we can (probably) raise it to accommodate > this DL task so perhaps we should say it fits ? In a later patch, if the task does not fit on any core (at its current frequency), the task is moved to the core having the maximum capacity (without considering the operating frequency --- at least, this was my intention when I wrote the patches :) Luca > > > + s64 rel_deadline = dl_se->dl_deadline; > > + u64 rem_runtime = dl_se->dl_runtime; > > + > > + if (c) > > + *c = cap; > > + > > + if ((rel_deadline * cap) >> SCHED_CAPACITY_SHIFT < > > rem_runtime) > > + return 0; > > + > > + return 1; > > +} > > Thanks, > Quentin
[RFC PATCH 2/6] sched/dl: Capacity-aware migrations
From: luca abeni Currently, the SCHED_DEADLINE scheduler uses a global EDF scheduling algorithm, migrating tasks to CPU cores without considering the core capacity and the task utilization. This works well on homogeneous systems (SCHED_DEADLINE tasks are guaranteed to have a bounded tardiness), but presents some issues on heterogeneous systems. For example, a SCHED_DEADLINE task might be migrated on a core that has not enough processing capacity to correctly serve the task (think about a task with runtime 70ms and period 100ms migrated to a core with processing capacity 0.5) This commit is a first step to address the issue: When a task wakes up or migrates away from a CPU core, the scheduler tries to find an idle core having enough processing capacity to serve the task. Signed-off-by: luca abeni --- kernel/sched/cpudeadline.c | 31 +-- kernel/sched/deadline.c| 8 ++-- kernel/sched/sched.h | 7 ++- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..d21f7905b9c1 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -110,6 +110,22 @@ static inline int cpudl_maximum(struct cpudl *cp) return cp->elements[0].cpu; } +static inline int dl_task_fit(const struct sched_dl_entity *dl_se, + int cpu, u64 *c) +{ + u64 cap = (arch_scale_cpu_capacity(NULL, cpu) * arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT; + s64 rel_deadline = dl_se->dl_deadline; + u64 rem_runtime = dl_se->dl_runtime; + + if (c) + *c = cap; + + if ((rel_deadline * cap) >> SCHED_CAPACITY_SHIFT < rem_runtime) + return 0; + + return 1; +} + /* * cpudl_find - find the best (later-dl) CPU in the system * @cp: the cpudl max-heap context @@ -125,8 +141,19 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, >cpus_allowed)) { - return 1; - } else { + int cpu; + + for_each_cpu(cpu, later_mask) { + u64 cap; + + if (!dl_task_fit(>dl, cpu, )) + 
cpumask_clear_cpu(cpu, later_mask); + } + + if (!cpumask_empty(later_mask)) + return 1; + } + { int best_cpu = cpudl_maximum(cp); WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5b981eeeb944..3436f3d8fa8f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1584,6 +1584,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) if (sd_flag != SD_BALANCE_WAKE) goto out; + if (dl_entity_is_special(>dl)) + goto out; + rq = cpu_rq(cpu); rcu_read_lock(); @@ -1598,10 +1601,11 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && + if ((unlikely(dl_task(curr)) && (curr->nr_cpus_allowed < 2 || !dl_entity_preempt(>dl, >dl)) && - (p->nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) || + static_branch_unlikely(_asym_cpucapacity)) { int target = find_later_rq(p); if (target != -1 && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 32d242694863..e5f9fd3aee80 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2367,7 +2367,12 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, static inline unsigned long cpu_bw_dl(struct rq *rq) { - return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; + unsigned long res; + + res = (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; + + return (res << SCHED_CAPACITY_SHIFT) / + arch_scale_cpu_capacity(NULL, rq->cpu); } static inline unsigned long cpu_util_dl(struct rq *rq) -- 2.20.1
[RFC PATCH 5/6] sched/dl: If the task does not fit anywhere, select the fastest core
From: luca abeni When a task has a remaining runtime that cannot be served within the scheduling deadline by any one of the idle cores, the task is doomed to miss its deadline (this can happen, because the admission control only guarantees a bounded tardiness, not the hard respect of all deadlines). In this case, try to select the fastest idle core, to minimize the tardiness. Signed-off-by: luca abeni --- kernel/sched/cpudeadline.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 111dd9ac837b..2a4ac7b529b7 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -123,7 +123,7 @@ static inline int dl_task_fit(const struct sched_dl_entity *dl_se, } if (c) - *c = cap; + *c = arch_scale_cpu_capacity(NULL, cpu); if ((rel_deadline * cap) >> SCHED_CAPACITY_SHIFT < rem_runtime) return 0; @@ -146,14 +146,22 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, >cpus_allowed)) { - int cpu; + int cpu, max_cpu = -1; + u64 max_cap = 0; for_each_cpu(cpu, later_mask) { u64 cap; if (!dl_task_fit(>dl, cpu, )) cpumask_clear_cpu(cpu, later_mask); + + if (cap > max_cap) { + max_cap = cap; + max_cpu = cpu; + } } + if (cpumask_empty(later_mask) && max_cap) + cpumask_set_cpu(max_cpu, later_mask); if (!cpumask_empty(later_mask)) return 1; -- 2.20.1
[RFC PATCH 6/6] sched/dl: Try not to select a too fast core
From: luca abeni When a task can fit on multiple CPU cores, try to select the slowest core that is able to properly serve the task. This avoids useless future migrations, leaving the "fast cores" idle for more heavyweight tasks. Signed-off-by: luca abeni --- kernel/sched/cpudeadline.c | 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 2a4ac7b529b7..897ed71af515 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -143,17 +143,24 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask) { const struct sched_dl_entity *dl_se = >dl; + struct cpumask tmp_mask; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, >cpus_allowed)) { + cpumask_and(_mask, cp->free_cpus, >cpus_allowed)) { int cpu, max_cpu = -1; - u64 max_cap = 0; + u64 max_cap = 0, min_cap = SCHED_CAPACITY_SCALE * SCHED_CAPACITY_SCALE; - for_each_cpu(cpu, later_mask) { + cpumask_clear(later_mask); + for_each_cpu(cpu, _mask) { u64 cap; - if (!dl_task_fit(>dl, cpu, )) - cpumask_clear_cpu(cpu, later_mask); + if (dl_task_fit(>dl, cpu, ) && (cap <= min_cap)) { + if (cap < min_cap) { + min_cap = cap; + cpumask_clear(later_mask); + } + cpumask_set_cpu(cpu, later_mask); + } if (cap > max_cap) { max_cap = cap; -- 2.20.1
[RFC PATCH 4/6] sched/dl: Improve capacity-aware wakeup
From: luca abeni Instead of considering the "static CPU bandwidth" allocated to a SCHED_DEADLINE task (ratio between its maximum runtime and reservation period), try to use the remaining runtime and time to scheduling deadline. Signed-off-by: luca abeni --- kernel/sched/cpudeadline.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d21f7905b9c1..111dd9ac837b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -114,8 +114,13 @@ static inline int dl_task_fit(const struct sched_dl_entity *dl_se, int cpu, u64 *c) { u64 cap = (arch_scale_cpu_capacity(NULL, cpu) * arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT; - s64 rel_deadline = dl_se->dl_deadline; - u64 rem_runtime = dl_se->dl_runtime; + s64 rel_deadline = dl_se->deadline - sched_clock_cpu(smp_processor_id()); + u64 rem_runtime = dl_se->runtime; + + if ((rel_deadline < 0) || (rel_deadline * dl_se->dl_runtime < dl_se->dl_deadline * rem_runtime)) { + rel_deadline = dl_se->dl_deadline; + rem_runtime = dl_se->dl_runtime; + } if (c) *c = cap; -- 2.20.1
[RFC PATCH 1/6] sched/dl: Improve deadline admission control for asymmetric CPU capacities
From: luca abeni Currently, the SCHED_DEADLINE admission control ensures that the sum of reserved CPU bandwidths is smaller than x * M, where the reserved CPU bandwidth of a SCHED_DEADLINE task is defined as the ratio between its runtime and its period, x is a user-definable percentage (95% by default, can be controlled through /proc/sys/kernel/sched_rt_{runtime,period}_us) and M is the number of CPU cores. This admission control works well (guaranteeing bounded tardiness for SCHED_DEADLINE tasks and non-starvation of non SCHED_DEADLINE tasks) for homogeneous systems (where all the CPU cores have the same computing capacity), but ends up over-allocating the CPU time in presence of less-powerful CPU cores. It can be easily shown how on asymmetric CPU capacity architectures (such as ARM big.LITTLE) SCHED_DEADLINE tasks can easily starve all the other tasks, making the system unusable. This commit fixes the issue by explicitly considering the cores' capacities in the admission test (where "M" is replaced by the sum of all the capacities of cores in the root domain). 
Signed-off-by: luca abeni --- drivers/base/arch_topology.c | 1 + include/linux/sched/topology.h | 3 +++ kernel/sched/core.c| 2 ++ kernel/sched/deadline.c| 19 +-- kernel/sched/sched.h | 7 +-- kernel/sched/topology.c| 9 + 6 files changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index edfcf8d982e4..646d6d349d53 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -36,6 +36,7 @@ DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) { + topology_update_cpu_capacity(cpu, per_cpu(cpu_scale, cpu), capacity); per_cpu(cpu_scale, cpu) = capacity; } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 2bf680b42f3c..a4898e42f368 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -233,4 +233,7 @@ static inline int task_node(const struct task_struct *p) return cpu_to_node(task_cpu(p)); } +void topology_update_cpu_capacity(unsigned int cpu, unsigned long old, + unsigned long new); + #endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8456801caa3..a37bbb246802 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6271,6 +6271,7 @@ void set_rq_online(struct rq *rq) if (class->rq_online) class->rq_online(rq); } + rq->rd->rd_capacity += arch_scale_cpu_capacity(NULL, cpu_of(rq)); } } @@ -6286,6 +6287,7 @@ void set_rq_offline(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; + rq->rd->rd_capacity -= arch_scale_cpu_capacity(NULL, cpu_of(rq)); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a73e41a2016..5b981eeeb944 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -63,6 +63,10 @@ static inline int dl_bw_cpus(int i) return cpus; } +static inline unsigned long rd_capacity(struct rq *rq) +{ + return rq->rd->rd_capacity; +} #else 
static inline struct dl_bw *dl_bw_of(int i) { @@ -73,6 +77,11 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long rd_capacity(struct rq *rq) +{ + return SCHED_CAPACITY_SCALE; +} #endif static inline @@ -2535,13 +2544,13 @@ int sched_dl_overflow(struct task_struct *p, int policy, raw_spin_lock(_b->lock); cpus = dl_bw_cpus(task_cpu(p)); if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, rd_capacity(task_rq(p)), 0, new_bw)) { if (hrtimer_active(>dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, rd_capacity(task_rq(p)), p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2689,7 +2698,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(_b->lock, flags); cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + overflow = __dl_overflow(dl_b, rd
[RFC PATCH 3/6] sched/dl: Try better placement even for deadline tasks that do not block
From: luca abeni Currently, the scheduler tries to find a proper placement for SCHED_DEADLINE tasks when they are pushed out of a core or when they wake up. Hence, if there is a single SCHED_DEADLINE task that never blocks and wakes up, such a task is never migrated to an appropriate CPU core, but continues to execute on its original core. This commit addresses the issue by trying to migrate a SCHED_DEADLINE task (searching for an appropriate CPU core) the first time it is throttled. Signed-off-by: luca abeni --- include/linux/sched.h | 1 + kernel/sched/deadline.c | 53 - kernel/sched/sched.h| 2 ++ 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 863f70843875..5e322c8a94e0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -560,6 +560,7 @@ struct sched_dl_entity { unsigned intdl_yielded: 1; unsigned intdl_non_contending : 1; unsigned intdl_overrun: 1; + unsigned intdl_adjust : 1; /* * Bandwidth enforcement timer. 
Each -deadline task has its diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3436f3d8fa8f..db471889196b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -515,6 +515,7 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) return dl_task(prev); } +static DEFINE_PER_CPU(struct callback_head, dl_migrate_head); static DEFINE_PER_CPU(struct callback_head, dl_push_head); static DEFINE_PER_CPU(struct callback_head, dl_pull_head); @@ -1149,6 +1150,32 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } +#ifdef CONFIG_SMP +static int find_later_rq(struct task_struct *task); + +static void migrate_dl_task(struct rq *rq) +{ + struct task_struct *t = rq->migrating_task; + struct sched_dl_entity *dl_se = >dl; + int cpu = find_later_rq(t); + + if ((cpu != -1) && (cpu != rq->cpu)) { + struct rq *later_rq; + + later_rq = cpu_rq(cpu); + + double_lock_balance(rq, later_rq); + sub_running_bw(>dl, >dl); + sub_rq_bw(>dl, >dl); + set_task_cpu(t, later_rq->cpu); + add_rq_bw(>dl, _rq->dl); + add_running_bw(>dl, _rq->dl); + double_unlock_balance(rq, later_rq); + } + rq->migrating_task = NULL; + dl_se->dl_adjust = 0; +} +#endif /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). 
@@ -1223,8 +1250,17 @@ static void update_curr_dl(struct rq *rq) dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) { enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); +#ifdef CONFIG_SMP + } else if (dl_se->dl_adjust) { + if (rq->migrating_task == NULL) { + queue_balance_callback(rq, _cpu(dl_migrate_head, rq->cpu), migrate_dl_task); + rq->migrating_task = current; + } else + printk_deferred("Throttled task before migrating the previous one???\n"); +#endif + } if (!is_leftmost(curr, >dl)) resched_curr(rq); @@ -1573,13 +1609,12 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP -static int find_later_rq(struct task_struct *task); - static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool het; if (sd_flag != SD_BALANCE_WAKE) goto out; @@ -1591,6 +1626,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + het = static_branch_unlikely(_asym_cpucapacity); /* * If we are dealing with a -deadline task, we must @@ -1604,15 +1640,17 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) if ((unlikely(dl_task(curr)) && (curr->nr_cpus_allowed < 2 || !dl_entity_preempt(>dl, >dl)) && - (p->nr_cpus_allowed > 1)) || - static_branch_unlikely(_asym_cpucapacity)) { + (p->nr_cpus_allowed > 1)) || het) { int target = find_later_rq(p);
[RFC PATCH 0/6] Capacity awareness for SCHED_DEADLINE
From: luca abeni The SCHED_DEADLINE scheduling policy currently has some issues with asymmetric CPU capacity architectures (such as ARM big.LITTLE). In particular, the admission control mechanism does not work correctly (because it considers all cores to have the same speed of the fastest core), and the scheduler does not consider the cores' speed when migrating tasks. This patchset fixes the issues by explicitly considering the cores' capacities in the admission control mechanism and in tasks' migration. In particular, the scheduler is improved so that "heavyweight tasks" (processes or threads having a runtime that on LITTLE cores becomes larger than the relative deadline) are scheduled on big cores. This allows respecting deadlines that are missed by the current scheduler. Moreover, the migration logic is modified so that "lightweight tasks" (processes or threads having a runtime that on LITTLE cores is still smaller than the relative deadline) are scheduled on LITTLE cores when possible. This allows saving some energy without missing deadlines. luca abeni (6): sched/dl: Improve deadline admission control for asymmetric CPU capacities sched/dl: Capacity-aware migrations sched/dl: Try better placement even for deadline tasks that do not block sched/dl: Improve capacity-aware wakeup sched/dl: If the task does not fit anywhere, select the fastest core sched/dl: Try not to select a too fast core drivers/base/arch_topology.c | 1 + include/linux/sched.h | 1 + include/linux/sched/topology.h | 3 ++ kernel/sched/core.c| 2 + kernel/sched/cpudeadline.c | 53 ++-- kernel/sched/deadline.c| 76 -- kernel/sched/sched.h | 16 +-- kernel/sched/topology.c| 9 8 files changed, 143 insertions(+), 18 deletions(-) -- 2.20.1
[tip:sched/urgent] sched/deadline: Correctly handle active 0-lag timers
Commit-ID: 1b02cd6a2d7f3e2a6a5262887d2cb2912083e42f Gitweb: https://git.kernel.org/tip/1b02cd6a2d7f3e2a6a5262887d2cb2912083e42f Author: luca abeni AuthorDate: Mon, 25 Mar 2019 14:15:30 +0100 Committer: Ingo Molnar CommitDate: Tue, 16 Apr 2019 16:54:58 +0200 sched/deadline: Correctly handle active 0-lag timers syzbot reported the following warning: [ ] WARNING: CPU: 4 PID: 17089 at kernel/sched/deadline.c:255 task_non_contending+0xae0/0x1950 line 255 of deadline.c is: WARN_ON(hrtimer_active(_se->inactive_timer)); in task_non_contending(). Unfortunately, in some cases (for example, a deadline task continuously blocking and waking immediately) it can happen that a task blocks (and task_non_contending() is called) while the 0-lag timer is still active. In this case, the safest thing to do is to immediately decrease the running bandwidth of the task, without trying to re-arm the 0-lag timer. Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: chengjian (D) Link: https://lkml.kernel.org/r/20190325131530.34706-1-luca.ab...@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a73e41a2016..43901fa3f269 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p) if (dl_entity_is_special(dl_se)) return; - WARN_ON(hrtimer_active(_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p) * If the "0-lag time" already passed, decrease the active * utilization now, instead of starting a timer */ - if (zerolag_time < 0) { + if ((zerolag_time < 0) || hrtimer_active(_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state
== TASK_DEAD) {
[PATCH] sched/deadline: correctly handle active 0-lag timers
syzbot reported the following warning: [Â 948.126369] WARNING: CPU: 4 PID: 17089 at kernel/sched/deadline.c:255 task_non_contending+0xae0/0x1950 [Â 948.130198] Kernel panic - not syncing: panic_on_warn set ... [Â 948.130198] [Â 948.134221] CPU: 4 PID: 17089 Comm: syz-executor.1 Not tainted 4.19.27 #2 [Â 948.139072] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [Â 948.141603] Call Trace: [Â 948.142277]Â dump_stack+0xca/0x13e [Â 948.164636]Â panic+0x1f7/0x543 [Â 948.168704]Â ? refcount_error_report+0x29d/0x29d [Â 948.172438]Â ? __warn+0x1d1/0x210 [Â 948.183359]Â ? task_non_contending+0xae0/0x1950 [Â 948.191747]Â __warn+0x1ec/0x210 [Â 948.196276]Â ? task_non_contending+0xae0/0x1950 [Â 948.202476]Â report_bug+0x1ee/0x2b0 [Â 948.204622]Â fixup_bug.part.7+0x37/0x80 [Â 948.206879]Â do_error_trap+0x22c/0x290 [Â 948.211340]Â ? math_error+0x2f0/0x2f0 [Â 948.217033]Â ? trace_hardirqs_off_caller+0x40/0x190 [Â 948.222477]Â ? trace_hardirqs_off_thunk+0x1a/0x1c [Â 948.229877]Â invalid_op+0x14/0x20 [Â 948.238317] RIP: 0010:task_non_contending+0xae0/0x1950 [Â 948.253825] Code: 6d 29 83 48 89 4c 24 20 48 89 54 24 10 c6 05 d0 89 5a 03 01 e8 11 ea ee ff 0f 0b 48 8b 4c 24 20 48 8b 54 24 10 e9 bb f7 ff ff <0f> 0b e9 1d f6 ff ff e8 d4 a7 09 00 85 c0 0f 85 74 f8 ff ff 48 c7 [Â 948.272329] RSP: 0018:8883d443f8c0 EFLAGS: 00010002 [Â 948.293045] RAX: 0001 RBX: 8883d3572468 RCX: 813a6571 [Â 948.300323] RDX: 08ab RSI: c900030e4000 RDI: 8883e2fe6278 [Â 948.305278] RBP: 8883e2f0 R08: ed1078ea3ab2 R09: ed1078ea3ab2 [Â 948.316441] R10: 0001 R11: ed1078ea3ab1 R12: 0002c680 [Â 948.320257] R13: 8883d357217c R14: 0001 R15: 8883d3572140 [Â 948.324500]Â ? hrtimer_active+0x171/0x1f0 [Â 948.327421]Â ? dequeue_task_dl+0x38/0x970 [Â 948.330572]Â __schedule+0x94b/0x1a80 [Â 948.333578]Â ? __sched_text_start+0x8/0x8 [Â 948.336141]Â ? lock_downgrade+0x5e0/0x5e0 [Â 948.338111]Â ? 
plist_add+0x23e/0x480 [Â 948.339706]Â schedule+0x7c/0x1a0 [Â 948.341395]Â futex_wait_queue_me+0x319/0x600 [Â 948.343329]Â ? get_futex_key_refs+0xd0/0xd0 [Â 948.345037]Â ? lock_downgrade+0x5e0/0x5e0 [Â 948.347206]Â ? get_futex_key_refs+0xa4/0xd0 [Â 948.353007]Â futex_wait+0x1e7/0x590 [Â 948.355328]Â ? futex_wait_setup+0x2b0/0x2b0 [Â 948.360578]Â ? __lock_acquire+0x60c/0x3b70 [Â 948.369186]Â ? __save_stack_trace+0x92/0x100 [Â 948.374344]Â ? hash_futex+0x15/0x210 [Â 948.376832]Â ? drop_futex_key_refs+0x3c/0xd0 [Â 948.378591]Â ? futex_wake+0x14e/0x450 [Â 948.381609]Â do_futex+0x5c9/0x15e0 [Â 948.384567]Â ? perf_syscall_enter+0xb1/0xc80 [Â 948.390307]Â ? exit_robust_list+0x240/0x240 [Â 948.393566]Â ? ftrace_syscall_exit+0x5c0/0x5c0 [Â 948.396369]Â ? lock_downgrade+0x5e0/0x5e0 [Â 948.401748]Â ? __might_fault+0x17c/0x1c0 [Â 948.404171]Â __x64_sys_futex+0x296/0x380 [Â 948.406472]Â ? __ia32_sys_futex+0x370/0x370 [Â 948.440630]Â ? trace_hardirqs_on_thunk+0x1a/0x1c [Â 948.441774]Â ? trace_hardirqs_off_caller+0x40/0x190 [Â 948.442770]Â ? do_syscall_64+0x3b/0x580 [Â 948.486728]Â do_syscall_64+0xc8/0x580 [Â 948.489138]Â entry_SYSCALL_64_after_hwframe+0x49/0xbe [Â 948.492072] RIP: 0033:0x462eb9 [Â 948.492788] Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 [Â 948.532016] RSP: 002b:7f7ac8a67cd8 EFLAGS: 0246 ORIG_RAX: 00ca [Â 948.536811] RAX: ffda RBX: 0073bf08 RCX: 00462eb9 [Â 948.542138] RDX: RSI: 0080 RDI: 0073bf08 [Â 948.548077] RBP: 0073bf00 R08: R09: [Â 948.562535] R10: R11: 0246 R12: 0073bf0c [Â 948.569184] R13: R14: 0073bf00 R15: 7fff106d8c10 line 255 of deadline.c is WARN_ON(hrtimer_active(_se->inactive_timer)); in task_non_contending(). 
Unfortunately, in some cases (for example, a deadline task continuously blocking and waking immediately) it can happen that a task blocks (and task_non_contending() is called) while the 0-lag timer is still active. In this case, the safest thing to do is to immediately decrease the running bandwidth of the task, without trying to re-arm the 0-lag timer. Signed-off-by: luca abeni --- kernel/sched/deadline.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a73e41a2016..43901fa3f269 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p) if (dl_entity_is_special(dl_se)) return; - WARN_ON(hrtimer_active(_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -269,7 +268,7 @@ static
Re: WARN ON at kernel/sched/deadline.c task_non_contending
Hi Juri, On Fri, 22 Mar 2019 15:32:32 +0100 Juri Lelli wrote: [...] > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > > index 6a73e41a2016..43901fa3f269 100644 > > --- a/kernel/sched/deadline.c > > +++ b/kernel/sched/deadline.c > > @@ -252,7 +252,6 @@ static void task_non_contending(struct > > task_struct *p) if (dl_entity_is_special(dl_se)) > > return; > > > > - WARN_ON(hrtimer_active(_se->inactive_timer)); > > WARN_ON(dl_se->dl_non_contending); > > > > zerolag_time = dl_se->deadline - > > @@ -269,7 +268,7 @@ static void task_non_contending(struct > > task_struct *p) > > * If the "0-lag time" already passed, decrease the active > > * utilization now, instead of starting a timer > > */ > > - if (zerolag_time < 0) { > > + if ((zerolag_time < 0) || > > hrtimer_active(_se->inactive_timer)) { if (dl_task(p)) > > sub_running_bw(dl_se, dl_rq); > > if (!dl_task(p) || p->state == TASK_DEAD) { > > > > > > The idea is that if the timer is active, we leave dl_non_contending > > set to 0 (so that the timer handler does nothing), and we > > immediately decrease the running bw. > > I think this is OK, because this situation can happen only if the > > task blocks, wakes up while the timer handler is running, and then > > immediately blocks again - while the timer handler is still > > running. So, the "zero lag time" cannot be too much in the future. > > And if we get here and the handler is running it means that the > handler is spinning on rq->lock waiting the dequeue to release it. > So, this looks safe to me as well. > > BTW, I could reproduce with Steve's deadline_test [1], and this seems > to fix it. > > Would you mind sending out a proper patch Luca? Thanks for looking at this. I'll try to prepare and send a patch in next week. Thanks, Luca
Re: WARN ON at kernel/sched/deadline.c task_non_contending
Hi, On Fri, 15 Mar 2019 08:43:00 +0800 "chengjian (D)" wrote: [...] > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > > index 6a73e41a2016..43901fa3f269 100644 > > --- a/kernel/sched/deadline.c > > +++ b/kernel/sched/deadline.c > > @@ -252,7 +252,6 @@ static void task_non_contending(struct > > task_struct *p) if (dl_entity_is_special(dl_se)) > > return; > > > > - WARN_ON(hrtimer_active(_se->inactive_timer)); > > WARN_ON(dl_se->dl_non_contending); > > > > zerolag_time = dl_se->deadline - > > @@ -269,7 +268,7 @@ static void task_non_contending(struct > > task_struct *p) > > * If the "0-lag time" already passed, decrease the active > > * utilization now, instead of starting a timer > > */ > > - if (zerolag_time < 0) { > > + if ((zerolag_time < 0) || > > hrtimer_active(_se->inactive_timer)) { if (dl_task(p)) > > sub_running_bw(dl_se, dl_rq); > > if (!dl_task(p) || p->state == TASK_DEAD) { > > > > > > The idea is that if the timer is active, we leave dl_non_contending > > set to 0 (so that the timer handler does nothing), and we > > immediately decrease the running bw. > > I think this is OK, because this situation can happen only if the > > task blocks, wakes up while the timer handler is running, and then > > immediately blocks again - while the timer handler is still > > running. So, the "zero lag time" cannot be too much in the future. > > > > > > Thanks, > > Luca > > > > . > > > Yeah, it looks good. > > I can do some experiments with it , > > Do you have some testcases to help me with the test ? I just tried the test you provided... I also have some other SCHED_DEADLINE tests at https://github.com/lucabe72/ReclaimingTests but I did not try them with this patch yet. Claudio Scordino also had some SCHED_DEADLINE tests here: https://github.com/evidence/test-sched-dl Luca
Re: WARN ON at kernel/sched/deadline.c task_non_contending
Hi, (I added Juri in cc) On Tue, 12 Mar 2019 10:03:12 +0800 "chengjian (D)" wrote: [...] > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index 31c050a0d0ce..d73cb033a06d 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -252,7 +252,6 @@ static void task_non_contending(struct > task_struct *p) if (dl_entity_is_special(dl_se)) > Â Â Â return; > > -Â Â WARN_ON(hrtimer_active(_se->inactive_timer)); > Â Â Â WARN_ON(dl_se->dl_non_contending); > > Â Â Â zerolag_time = dl_se->deadline - > @@ -287,7 +286,9 @@ static void task_non_contending(struct > task_struct *p) } > > Â Â Â dl_se->dl_non_contending = 1; > -Â Â get_task_struct(p); > + > +Â Â if (!hrtimer_active(_se->inactive_timer)); > +Â Â get_task_struct(p); > Â Â Â hrtimer_start(timer, ns_to_ktime(zerolag_time), > HRTIMER_MODE_REL); } After looking at the patch a little bit more and running some tests, I suspect this solution might be racy: when the timer is already active, (and hrtimer_start() fails), it relies on its handler to decrease the running bw (by setting dl_non_contending to 1)... But inactive_task_timer() might have already checked dl_non_contending, finding it equal to 0 (so, it ends up doing nothing and the running bw is not decreased). So, I would prefer a different solution. 
I think this patch should work: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a73e41a2016..43901fa3f269 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p) if (dl_entity_is_special(dl_se)) return; - WARN_ON(hrtimer_active(_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p) * If the "0-lag time" already passed, decrease the active * utilization now, instead of starting a timer */ - if (zerolag_time < 0) { + if ((zerolag_time < 0) || hrtimer_active(_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { The idea is that if the timer is active, we leave dl_non_contending set to 0 (so that the timer handler does nothing), and we immediately decrease the running bw. I think this is OK, because this situation can happen only if the task blocks, wakes up while the timer handler is running, and then immediately blocks again - while the timer handler is still running. So, the "zero lag time" cannot be too much in the future. Thanks, Luca
Re: WARN ON at kernel/sched/deadline.c task_non_contending
Hi all, On Tue, 12 Mar 2019 10:03:12 +0800 "chengjian (D)" wrote: > Hi. > > When looking to test SCHED_DEADLINE syzkaller report an warn in > task_non_contending(). I tested the mainline kernel with the C program > and captured the same call trace. [...] > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index 31c050a0d0ce..d73cb033a06d 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -252,7 +252,6 @@ static void task_non_contending(struct > task_struct *p) if (dl_entity_is_special(dl_se)) > Â Â Â return; > > -Â Â WARN_ON(hrtimer_active(_se->inactive_timer)); > Â Â Â WARN_ON(dl_se->dl_non_contending); > > Â Â Â zerolag_time = dl_se->deadline - > @@ -287,7 +286,9 @@ static void task_non_contending(struct > task_struct *p) } > > Â Â Â dl_se->dl_non_contending = 1; > -Â Â get_task_struct(p); > + > +Â Â if (!hrtimer_active(_se->inactive_timer)); > +Â Â get_task_struct(p); > Â Â Â hrtimer_start(timer, ns_to_ktime(zerolag_time), > HRTIMER_MODE_REL); } At a first glance, I think the patch is OK, but I need some more time to look at the details. I'll run some experiments with the reproducer, and I'll let you know my conclusions. > Did I miss something ? > > I saw it directly remove the hrtimer in hrtime_start() if hrtime is > queued, it may be unsafe here when the timer handler is running. This is probably why I added that WARN_ON()... I'll look at a possible solution. Thanks, Luca > > Help ? > > I put the syzkaller log and C demo in attachments. > > Thanks. > > >
Re: WARNING in enqueue_task_dl
Hi all, (and, happy new year to everyone!) this looks similar to a bug we have seen some time ago (a task switching from SCHED_OTHER to SCHED_DEADLINE while inheriting a deadline from a SCHED_DEADLINE task triggers the warning)... Juri, I think you found a fix for such a bug; has it been committed? Thanks, Luca On Mon, 31 Dec 2018 07:02:04 -0800 syzbot wrote: > syzbot has found a reproducer for the following crash on: > > HEAD commit:195303136f19 Merge tag 'kconfig-v4.21-2' of > git://git.kern.. git tree: upstream > console output: > https://syzkaller.appspot.com/x/log.txt?x=118af84b40 kernel > config: https://syzkaller.appspot.com/x/.config?x=76d28549be7c27cf > dashboard link: > https://syzkaller.appspot.com/bug?extid=119ba87189432ead09b4 > compiler: gcc (GCC) 9.0.0 20181231 (experimental) syz > repro: > https://syzkaller.appspot.com/x/repro.syz?x=10eb7ebf40 C > reproducer: https://syzkaller.appspot.com/x/repro.c?x=14156d7740 > > IMPORTANT: if you fix the bug, please add the following tag to the > commit: Reported-by: > syzbot+119ba87189432ead0...@syzkaller.appspotmail.com > > WARNING: CPU: 0 PID: 9019 at kernel/sched/deadline.c:628 > setup_new_dl_entity kernel/sched/deadline.c:629 [inline] > WARNING: CPU: 0 PID: 9019 at kernel/sched/deadline.c:628 > enqueue_dl_entity kernel/sched/deadline.c:1429 [inline] > WARNING: CPU: 0 PID: 9019 at kernel/sched/deadline.c:628 > enqueue_task_dl+0x2355/0x3dc0 kernel/sched/deadline.c:1500 > Kernel panic - not syncing: panic_on_warn set ... 
> CPU: 0 PID: 9019 Comm: syz-executor280 Not tainted 4.20.0+ #1 > Hardware name: Google Google Compute Engine/Google Compute Engine, > BIOS Google 01/01/2011 > Call Trace: > __dump_stack lib/dump_stack.c:77 [inline] > dump_stack+0x1db/0x2d0 lib/dump_stack.c:113 > panic+0x2cb/0x589 kernel/panic.c:189 > __warn.cold+0x20/0x4b kernel/panic.c:544 > report_bug+0x263/0x2b0 lib/bug.c:186 > fixup_bug arch/x86/kernel/traps.c:178 [inline] > fixup_bug arch/x86/kernel/traps.c:173 [inline] > do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 > do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:290 > invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 > RIP: 0010:setup_new_dl_entity kernel/sched/deadline.c:628 [inline] > RIP: 0010:enqueue_dl_entity kernel/sched/deadline.c:1429 [inline] > RIP: 0010:enqueue_task_dl+0x2355/0x3dc0 kernel/sched/deadline.c:1500 > Code: 3c 02 00 0f 85 ba 05 00 00 49 8b b5 50 0a 00 00 e9 53 fa ff ff > e8 fb f2 64 00 48 8d 4d d8 e9 48 dd ff ff 0f 0b e9 92 f1 ff ff <0f> > 0b e9 18 f1 ff ff 4c 89 ef 4c 89 95 28 ff ff ff 4c 89 85 30 ff > RSP: 0018:88809eebfaf8 EFLAGS: 00010002 > RAX: 0002 RBX: 111013dd7f6a RCX: dc00 > RDX: 00333cf09f75 RSI: 0004 RDI: 8880ae62d850 > RBP: 88809eebfbf8 R08: 88807fb0a538 R09: 88807fb0a2fc > R10: 88807fb0a580 R11: 8880ae62dc7b R12: 88807fb0a2c0 > R13: 8880ae62ce00 R14: 8880ae62ce00 R15: 88807fb0a58c > enqueue_task+0xb9/0x380 kernel/sched/core.c:730 > __sched_setscheduler+0xe32/0x1fe0 kernel/sched/core.c:4336 > sched_setattr kernel/sched/core.c:4394 [inline] > __do_sys_sched_setattr kernel/sched/core.c:4570 [inline] > __se_sys_sched_setattr kernel/sched/core.c:4549 [inline] > __x64_sys_sched_setattr+0x1af/0x2f0 kernel/sched/core.c:4549 > do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > RIP: 0033:0x44c829 > Code: e8 8c d8 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 > 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> > 3d 01 f0 ff ff 0f 83 eb c9 fb ff c3 66 
2e 0f 1f 84 00 00 00 00 > RSP: 002b:7f28685e8ce8 EFLAGS: 0246 ORIG_RAX: 013a > RAX: ffda RBX: 006e49f8 RCX: 0044c829 > RDX: RSI: 2000 RDI: > RBP: 006e49f0 R08: R09: > R10: R11: 0246 R12: 006e49fc > R13: 7ffd1981c8af R14: 7f28685e99c0 R15: 0001 > > == > WARNING: possible circular locking dependency detected > 4.20.0+ #1 Not tainted > -- > syz-executor280/9019 is trying to acquire lock: > 1aef527c ((console_sem).lock){-.-.}, at: > down_trylock+0x13/0x70 kernel/locking/semaphore.c:136 > > but task is already holding lock: > 0ba17b09 (>lock){-.-.}, at: task_rq_lock+0xc8/0x290 > kernel/sched/core.c:99 > > which lock already depends on the new lock. > > > the existing dependency chain (in reverse order) is: > > -> #2 (>lock){-.-.}: > __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] > _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 > rq_lock kernel/sched/sched.h:1149 [inline] >
Re: WARNING in enqueue_task_dl
Hi all, On Mon, 19 Nov 2018 09:23:03 +0100 (CET) Thomas Gleixner wrote: > Adding scheduler folks > > On Sun, 18 Nov 2018, syzbot wrote: > > > Hello, > > > > syzbot found the following crash on: > > > > HEAD commit:1ce80e0fe98e Merge tag 'fsnotify_for_v4.20-rc3' of > > git://g.. git tree: upstream > > console output: > > https://syzkaller.appspot.com/x/log.txt?x=14ddbb0b40 kernel > > config: https://syzkaller.appspot.com/x/.config?x=d86f24333880b605 > > dashboard link: > > https://syzkaller.appspot.com/bug?extid=119ba87189432ead09b4 > > compiler: gcc (GCC) 8.0.1 20180413 (experimental) syz > > repro: > > https://syzkaller.appspot.com/x/repro.syz?x=13e9e01540 > > > > IMPORTANT: if you fix the bug, please add the following tag to the > > commit: Reported-by: > > syzbot+119ba87189432ead0...@syzkaller.appspotmail.com > > > > IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready > > IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready > > 8021q: adding VLAN 0 to HW filter on device team0 > > hrtimer: interrupt took 33411 ns > > sched: DL replenish lagged too much > > WARNING: CPU: 1 PID: 6351 at kernel/sched/deadline.c:628 > > enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504 Here, it looks like a task is invoking sched_setattr() to become SCHED_DEADLINE when dl_boosted is set... Is this possible / correct? If this (sched_setattr() with dl_boosted set) should not be possible, then we have a bug that we need to investigate... Otherwise, I suspect we can just remove the WARN_ON at line 628 of deadline.c Luca > > PM: Basic memory bitmaps freed > > Kernel panic - not syncing: panic_on_warn set ... 
> > CPU: 1 PID: 6351 Comm: syz-executor0 Not tainted 4.20.0-rc2+ #338 > > Hardware name: Google Google Compute Engine/Google Compute Engine, > > BIOS Google 01/01/2011 > > Call Trace: > > __dump_stack lib/dump_stack.c:77 [inline] > > dump_stack+0x244/0x39d lib/dump_stack.c:113 > > panic+0x2ad/0x55c kernel/panic.c:188 > > __warn.cold.8+0x20/0x45 kernel/panic.c:540 > > report_bug+0x254/0x2d0 lib/bug.c:186 > > fixup_bug arch/x86/kernel/traps.c:178 [inline] > > do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 > > do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290 > > invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:969 > > RIP: 0010:enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504 > > Code: ff 48 8b 8d c8 fe ff ff 48 c1 e6 2a 4c 8b 9d d0 fe ff ff 8b > > 95 d8 fe ff ff 48 8b 85 e0 fe ff ff e9 16 e4 ff ff e8 16 d0 ea ff > > <0f> 0b e9 17 f1 ff ff 48 8b bd e8 fe ff ff 4c 89 95 c8 fe ff ff 48 > > RSP: 0018:8881ba39fa18 EFLAGS: 00010002 > > RAX: RBX: 8881b9d6c000 RCX: 8881b9d6c278 > > RDX: 8881b9d6c03c RSI: 0002 RDI: 8881daf2d710 > > RBP: 8881ba39fb78 R08: 0001 R09: 8881daf0 > > R10: 001a4d4f1987 R11: 8881daf2db3b R12: 111037473f4e > > R13: 8881b9d6c2cc R14: 8881daf2ccc0 R15: 8881daf2ccc0 > > enqueue_task+0x184/0x390 kernel/sched/core.c:730 > > __sched_setscheduler+0xe99/0x2190 kernel/sched/core.c:4336 > > sched_setattr kernel/sched/core.c:4394 [inline] > > __do_sys_sched_setattr kernel/sched/core.c:4570 [inline] > > __se_sys_sched_setattr kernel/sched/core.c:4549 [inline] > > __x64_sys_sched_setattr+0x1b2/0x2f0 kernel/sched/core.c:4549 > > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > RIP: 0033:0x457569 > > Code: fd b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 > > 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 > > <48> 3d 01 f0 ff ff 0f 83 cb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 > > RSP: 002b:7f05ce0a2c78 EFLAGS: 0246 ORIG_RAX: > > 013a RAX: ffda RBX: 0003 RCX: > > 
00457569 RDX: RSI: 2000 RDI: > > RBP: 0072bfa0 R08: R09: > > R10: R11: 0246 R12: > > 7f05ce0a36d4 R13: 004c369f R14: 004d5730 R15: > > > > > > == > > WARNING: possible circular locking dependency detected > > 4.20.0-rc2+ #338 Not tainted > > -- > > syz-executor0/6351 is trying to acquire lock: > > b2b97155 ((console_sem).lock){-.-.}, at: > > down_trylock+0x13/0x70 kernel/locking/semaphore.c:136 > > > > but task is already holding lock: > > 4cd5557e (>lock){-.-.}, at: task_rq_lock+0xc5/0x2a0 > > kernel/sched/core.c:99 > > > > which lock already depends on the new lock. > > > > > > the existing dependency chain (in reverse order) is: > > > > -> #2 (>lock){-.-.}: > > __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] > > _raw_spin_lock+0x2d/0x40 kernel/locking/spinlock.c:144 > > rq_lock kernel/sched/sched.h:1126 [inline] > >
Re: WARNING in enqueue_task_dl
Hi all, On Mon, 19 Nov 2018 09:23:03 +0100 (CET) Thomas Gleixner wrote: > Adding scheduler folks > > On Sun, 18 Nov 2018, syzbot wrote: > > > Hello, > > > > syzbot found the following crash on: > > > > HEAD commit:1ce80e0fe98e Merge tag 'fsnotify_for_v4.20-rc3' of > > git://g.. git tree: upstream > > console output: > > https://syzkaller.appspot.com/x/log.txt?x=14ddbb0b40 kernel > > config: https://syzkaller.appspot.com/x/.config?x=d86f24333880b605 > > dashboard link: > > https://syzkaller.appspot.com/bug?extid=119ba87189432ead09b4 > > compiler: gcc (GCC) 8.0.1 20180413 (experimental) syz > > repro: > > https://syzkaller.appspot.com/x/repro.syz?x=13e9e01540 > > > > IMPORTANT: if you fix the bug, please add the following tag to the > > commit: Reported-by: > > syzbot+119ba87189432ead0...@syzkaller.appspotmail.com > > > > IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready > > IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready > > 8021q: adding VLAN 0 to HW filter on device team0 > > hrtimer: interrupt took 33411 ns > > sched: DL replenish lagged too much > > WARNING: CPU: 1 PID: 6351 at kernel/sched/deadline.c:628 > > enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504 Here, it looks like a task is invoking sched_setattr() to become SCHED_DEADLINE when dl_boosted is set... Is this possible / correct? If this (sched_setattr() with dl_boosted set) should not be possible, then we have a bug that we need to investigate... Otherwise, I suspect we can just remove the WARN_ON at line 628 of deadline.c Luca > > PM: Basic memory bitmaps freed > > Kernel panic - not syncing: panic_on_warn set ... 
> > CPU: 1 PID: 6351 Comm: syz-executor0 Not tainted 4.20.0-rc2+ #338 > > Hardware name: Google Google Compute Engine/Google Compute Engine, > > BIOS Google 01/01/2011 > > Call Trace: > > __dump_stack lib/dump_stack.c:77 [inline] > > dump_stack+0x244/0x39d lib/dump_stack.c:113 > > panic+0x2ad/0x55c kernel/panic.c:188 > > __warn.cold.8+0x20/0x45 kernel/panic.c:540 > > report_bug+0x254/0x2d0 lib/bug.c:186 > > fixup_bug arch/x86/kernel/traps.c:178 [inline] > > do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 > > do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290 > > invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:969 > > RIP: 0010:enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504 > > Code: ff 48 8b 8d c8 fe ff ff 48 c1 e6 2a 4c 8b 9d d0 fe ff ff 8b > > 95 d8 fe ff ff 48 8b 85 e0 fe ff ff e9 16 e4 ff ff e8 16 d0 ea ff > > <0f> 0b e9 17 f1 ff ff 48 8b bd e8 fe ff ff 4c 89 95 c8 fe ff ff 48 > > RSP: 0018:8881ba39fa18 EFLAGS: 00010002 > > RAX: RBX: 8881b9d6c000 RCX: 8881b9d6c278 > > RDX: 8881b9d6c03c RSI: 0002 RDI: 8881daf2d710 > > RBP: 8881ba39fb78 R08: 0001 R09: 8881daf0 > > R10: 001a4d4f1987 R11: 8881daf2db3b R12: 111037473f4e > > R13: 8881b9d6c2cc R14: 8881daf2ccc0 R15: 8881daf2ccc0 > > enqueue_task+0x184/0x390 kernel/sched/core.c:730 > > __sched_setscheduler+0xe99/0x2190 kernel/sched/core.c:4336 > > sched_setattr kernel/sched/core.c:4394 [inline] > > __do_sys_sched_setattr kernel/sched/core.c:4570 [inline] > > __se_sys_sched_setattr kernel/sched/core.c:4549 [inline] > > __x64_sys_sched_setattr+0x1b2/0x2f0 kernel/sched/core.c:4549 > > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > RIP: 0033:0x457569 > > Code: fd b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 > > 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 > > <48> 3d 01 f0 ff ff 0f 83 cb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 > > RSP: 002b:7f05ce0a2c78 EFLAGS: 0246 ORIG_RAX: > > 013a RAX: ffda RBX: 0003 RCX: > > 
00457569 RDX: RSI: 2000 RDI: > > RBP: 0072bfa0 R08: R09: > > R10: R11: 0246 R12: > > 7f05ce0a36d4 R13: 004c369f R14: 004d5730 R15: > > > > > > == > > WARNING: possible circular locking dependency detected > > 4.20.0-rc2+ #338 Not tainted > > -- > > syz-executor0/6351 is trying to acquire lock: > > b2b97155 ((console_sem).lock){-.-.}, at: > > down_trylock+0x13/0x70 kernel/locking/semaphore.c:136 > > > > but task is already holding lock: > > 4cd5557e (>lock){-.-.}, at: task_rq_lock+0xc5/0x2a0 > > kernel/sched/core.c:99 > > > > which lock already depends on the new lock. > > > > > > the existing dependency chain (in reverse order) is: > > > > -> #2 (>lock){-.-.}: > > __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] > > _raw_spin_lock+0x2d/0x40 kernel/locking/spinlock.c:144 > > rq_lock kernel/sched/sched.h:1126 [inline] > >
Re: INFO: rcu detected stall in do_idle
Hi Peter, On Tue, 30 Oct 2018 11:45:54 +0100 Peter Zijlstra wrote: [...] > > 2. This is related to perf_event_open syscall reproducer does > > before becoming DEADLINE and entering the busy loop. Enabling of > > perf swevents generates lot of hrtimers load that happens in the > > reproducer task context. Now, DEADLINE uses rq_clock() for > > setting deadlines, but rq_clock_task() for doing runtime > > enforcement. In a situation like this it seems that the amount of > > irq pressure becomes pretty big (I'm seeing this on kvm, real hw > > should maybe do better, pain point remains I guess), so rq_clock() > > and rq_clock_task() might become more a more skewed w.r.t. each > > other. Since rq_clock() is only used when setting absolute > > deadlines for the first time (or when resetting them in certain > > cases), after a bit the replenishment code will start to see > > postponed deadlines always in the past w.r.t. rq_clock(). And this > > brings us back to the fact that the task is never stopped, since it > > can't keep up with rq_clock(). > > > > - Not sure yet how we want to address this [1]. We could use > > rq_clock() everywhere, but tasks might be penalized by irq > > pressure (theoretically this would mandate that irqs are > > explicitly accounted for I guess). I tried to use the skew > > between the two clocks to "fix" deadlines, but that puts us at > > risks of de-synchronizing userspace and kernel views of deadlines. > > Hurm.. right. We knew of this issue back when we did it. > I suppose now it hurts and we need to figure something out. > > By virtue of being a real-time class, we do indeed need to have > deadline on the wall-clock. But if we then don't account runtime on > that same clock, but on a potentially slower clock, we get the > problem that we can run longer than our period/deadline, which is > what we're running into here I suppose. 
I might be hugely misunderstanding something here, but in my impression the issue is just that if the IRQ time is not accounted to the -deadline task, then the non-deadline tasks might be starved. I do not see this as a skew between two clocks, but as an accounting thing: - if we decide that the IRQ time is accounted to the -deadline task (this is what happens with CONFIG_IRQ_TIME_ACCOUNTING disabled), then the non-deadline tasks are not starved (but of course the -deadline task executes for less than its reserved time in the period); - if we decide that the IRQ time is not accounted to the -deadline task (this is what happens with CONFIG_IRQ_TIME_ACCOUNTING enabled), then the -deadline task executes for the expected amount of time (about 60% of the CPU time), but an IRQ load of 40% will starve non-deadline tasks (this is what happens in the bug that triggered this discussion) I think this might be seen as an admission control issue: when CONFIG_IRQ_TIME_ACCOUNTING is disabled, the IRQ time is accounted for in the admission control (because it ends up in the task's runtime), but when CONFIG_IRQ_TIME_ACCOUNTING is enabled the IRQ time is not accounted for in the admission test (the IRQ handler becomes some sort of entity with a higher priority than -deadline tasks, on which no accounting or enforcement is performed). > And yes, at some point RT workloads need to be aware of the jitter > injected by things like IRQs and such. But I believe the rationale was > that for soft real-time workloads this current semantic was 'easier' > because we get to ignore IRQ overhead for workload estimation etc. > > What we could maybe do is track runtime in both rq_clock_task() and > rq_clock() and detect where the rq_clock based one exceeds the period > and then push out the deadline (and add runtime). > > Maybe something along such lines; does that make sense? Uhm... I have to study and test your patch... I'll comment on this later. 
Thanks, Luca > > --- > include/linux/sched.h | 3 +++ > kernel/sched/deadline.c | 53 > - 2 files changed, 38 > insertions(+), 18 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 8f8a5418b627..6aec81cb3d2e 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -522,6 +522,9 @@ struct sched_dl_entity { > u64 deadline; /* > Absolute deadline for this instance */ unsigned > int flags; /* Specifying the > scheduler behaviour */ > + u64 wallstamp; > + s64 walltime; > + > /* >* Some bool flags: >* > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index 91e4202b0634..633c8f36c700 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -683,16 +683,7 @@ static void replenish_dl_entity(struct > sched_dl_entity *dl_se, if (dl_se->dl_yielded
Re: INFO: rcu detected stall in do_idle
Hi Peter, On Tue, 30 Oct 2018 11:45:54 +0100 Peter Zijlstra wrote: [...] > > 2. This is related to perf_event_open syscall reproducer does > > before becoming DEADLINE and entering the busy loop. Enabling of > > perf swevents generates lot of hrtimers load that happens in the > > reproducer task context. Now, DEADLINE uses rq_clock() for > > setting deadlines, but rq_clock_task() for doing runtime > > enforcement. In a situation like this it seems that the amount of > > irq pressure becomes pretty big (I'm seeing this on kvm, real hw > > should maybe do better, pain point remains I guess), so rq_clock() > > and rq_clock_task() might become more a more skewed w.r.t. each > > other. Since rq_clock() is only used when setting absolute > > deadlines for the first time (or when resetting them in certain > > cases), after a bit the replenishment code will start to see > > postponed deadlines always in the past w.r.t. rq_clock(). And this > > brings us back to the fact that the task is never stopped, since it > > can't keep up with rq_clock(). > > > > - Not sure yet how we want to address this [1]. We could use > > rq_clock() everywhere, but tasks might be penalized by irq > > pressure (theoretically this would mandate that irqs are > > explicitly accounted for I guess). I tried to use the skew > > between the two clocks to "fix" deadlines, but that puts us at > > risks of de-synchronizing userspace and kernel views of deadlines. > > Hurm.. right. We knew of this issue back when we did it. > I suppose now it hurts and we need to figure something out. > > By virtue of being a real-time class, we do indeed need to have > deadline on the wall-clock. But if we then don't account runtime on > that same clock, but on a potentially slower clock, we get the > problem that we can run longer than our period/deadline, which is > what we're running into here I suppose. 
I might be hugely misunderstanding something here, but in my impression the issue is just that if the IRQ time is not accounted to the -deadline task, then the non-deadline tasks might be starved. I do not see this as a skew between two clocks, but as an accounting thing: - if we decide that the IRQ time is accounted to the -deadline task (this is what happens with CONFIG_IRQ_TIME_ACCOUNTING disabled), then the non-deadline tasks are not starved (but of course the -deadline task executes for less than its reserved time in the period); - if we decide that the IRQ time is not accounted to the -deadline task (this is what happens with CONFIG_IRQ_TIME_ACCOUNTING enabled), then the -deadline task executes for the expected amount of time (about 60% of the CPU time), but an IRQ load of 40% will starve non-deadline tasks (this is what happens in the bug that triggered this discussion) I think this might be seen as an admission control issue: when CONFIG_IRQ_TIME_ACCOUNTING is disabled, the IRQ time is accounted for in the admission control (because it ends up in the task's runtime), but when CONFIG_IRQ_TIME_ACCOUNTING is enabled the IRQ time is not accounted for in the admission test (the IRQ handler becomes some sort of entity with a higher priority than -deadline tasks, on which no accounting or enforcement is performed). > And yes, at some point RT workloads need to be aware of the jitter > injected by things like IRQs and such. But I believe the rationale was > that for soft real-time workloads this current semantic was 'easier' > because we get to ignore IRQ overhead for workload estimation etc. > > What we could maybe do is track runtime in both rq_clock_task() and > rq_clock() and detect where the rq_clock based one exceeds the period > and then push out the deadline (and add runtime). > > Maybe something along such lines; does that make sense? Uhm... I have to study and test your patch... I'll comment on this later. 
Thanks, Luca > > --- > include/linux/sched.h | 3 +++ > kernel/sched/deadline.c | 53 > - 2 files changed, 38 > insertions(+), 18 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 8f8a5418b627..6aec81cb3d2e 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -522,6 +522,9 @@ struct sched_dl_entity { > u64 deadline; /* > Absolute deadline for this instance */ unsigned > int flags; /* Specifying the > scheduler behaviour */ > + u64 wallstamp; > + s64 walltime; > + > /* >* Some bool flags: >* > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index 91e4202b0634..633c8f36c700 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -683,16 +683,7 @@ static void replenish_dl_entity(struct > sched_dl_entity *dl_se, if (dl_se->dl_yielded
Re: INFO: rcu detected stall in do_idle
On Fri, 19 Oct 2018 13:39:42 +0200 Peter Zijlstra wrote: > On Thu, Oct 18, 2018 at 01:08:11PM +0200, luca abeni wrote: > > Ok, I see the issue now: the problem is that the "while > > (dl_se->runtime <= 0)" loop is executed at replenishment time, but > > the deadline should be postponed at enforcement time. > > > > I mean: in update_curr_dl() we do: > > dl_se->runtime -= scaled_delta_exec; > > if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { > > ... > > enqueue replenishment timer at dl_next_period(dl_se) > > But dl_next_period() is based on a "wrong" deadline! > > > > > > I think that inserting a > > while (dl_se->runtime <= -pi_se->dl_runtime) { > > dl_se->deadline += pi_se->dl_period; > > dl_se->runtime += pi_se->dl_runtime; > > } > > immediately after "dl_se->runtime -= scaled_delta_exec;" would fix > > the problem, no? > > That certainly makes sense to me. Good; I'll try to work on this idea in the weekend. Thanks, Luca > The only remaining issue would then > be placing a limit on the amount of times we can take that loop; > which, as you propose in a later email; can be done separately as a > limit on runtime.
Re: INFO: rcu detected stall in do_idle
On Fri, 19 Oct 2018 13:39:42 +0200 Peter Zijlstra wrote: > On Thu, Oct 18, 2018 at 01:08:11PM +0200, luca abeni wrote: > > Ok, I see the issue now: the problem is that the "while > > (dl_se->runtime <= 0)" loop is executed at replenishment time, but > > the deadline should be postponed at enforcement time. > > > > I mean: in update_curr_dl() we do: > > dl_se->runtime -= scaled_delta_exec; > > if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { > > ... > > enqueue replenishment timer at dl_next_period(dl_se) > > But dl_next_period() is based on a "wrong" deadline! > > > > > > I think that inserting a > > while (dl_se->runtime <= -pi_se->dl_runtime) { > > dl_se->deadline += pi_se->dl_period; > > dl_se->runtime += pi_se->dl_runtime; > > } > > immediately after "dl_se->runtime -= scaled_delta_exec;" would fix > > the problem, no? > > That certainly makes sense to me. Good; I'll try to work on this idea in the weekend. Thanks, Luca > The only remaining issue would then > be placing a limit on the amount of times we can take that loop; > which, as you propose in a later email; can be done separately as a > limit on runtime.
Re: INFO: rcu detected stall in do_idle
Hi Juri, On Thu, 18 Oct 2018 14:21:42 +0200 Juri Lelli wrote: [...] > > > > I missed the original emails, but maybe the issue is that the > > > > task blocks before the tick, and when it wakes up again > > > > something goes wrong with the deadline and runtime assignment? > > > > (maybe because the deadline is in the past?) > > > > > > No, the problem is that the task won't be throttled at all, > > > because its replenishing instant is always way in the past when > > > tick occurs. :-/ > > > > Ok, I see the issue now: the problem is that the "while > > (dl_se->runtime <= 0)" loop is executed at replenishment time, but > > the deadline should be postponed at enforcement time. > > > > I mean: in update_curr_dl() we do: > > dl_se->runtime -= scaled_delta_exec; > > if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { > > ... > > enqueue replenishment timer at dl_next_period(dl_se) > > But dl_next_period() is based on a "wrong" deadline! > > > > > > I think that inserting a > > while (dl_se->runtime <= -pi_se->dl_runtime) { > > dl_se->deadline += pi_se->dl_period; > > dl_se->runtime += pi_se->dl_runtime; > > } > > immediately after "dl_se->runtime -= scaled_delta_exec;" would fix > > the problem, no? > > Mmm, I also thought of letting the task "pay back" its overrunning. > But, doesn't this get us quite far from what one would expect. I mean, > enforcement granularity will be way different from task period, no? Yes, the granularity will be what the kernel can provide (due to the HZ value and to the hrtick on/off state). But at least the task will not starve non-deadline tasks (which is the bug that originated this discussion, I think). 
If I understand well, there are two different (and orthogonal) issues here: 1) Due to a bug in the accounting / enforcement mechanisms (the wrong placement of the while() loop), the task consumes 100% of the CPU time, starving non-deadline tasks 2) Due to the large HZ value, the small runtime (and period) and the fact that hrtick is disabled, the kernel cannot provide the requested scheduling granularity The second issue can be fixed by imposing limits on minimum and maximum runtime and the first issue can be fixed by changing the code as I suggested in my previous email. I would suggest to address both issues, with separate changes (the current replenishment code looks strange anyway). Luca
Re: INFO: rcu detected stall in do_idle
Hi Juri, On Thu, 18 Oct 2018 14:21:42 +0200 Juri Lelli wrote: [...] > > > > I missed the original emails, but maybe the issue is that the > > > > task blocks before the tick, and when it wakes up again > > > > something goes wrong with the deadline and runtime assignment? > > > > (maybe because the deadline is in the past?) > > > > > > No, the problem is that the task won't be throttled at all, > > > because its replenishing instant is always way in the past when > > > tick occurs. :-/ > > > > Ok, I see the issue now: the problem is that the "while > > (dl_se->runtime <= 0)" loop is executed at replenishment time, but > > the deadline should be postponed at enforcement time. > > > > I mean: in update_curr_dl() we do: > > dl_se->runtime -= scaled_delta_exec; > > if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { > > ... > > enqueue replenishment timer at dl_next_period(dl_se) > > But dl_next_period() is based on a "wrong" deadline! > > > > > > I think that inserting a > > while (dl_se->runtime <= -pi_se->dl_runtime) { > > dl_se->deadline += pi_se->dl_period; > > dl_se->runtime += pi_se->dl_runtime; > > } > > immediately after "dl_se->runtime -= scaled_delta_exec;" would fix > > the problem, no? > > Mmm, I also thought of letting the task "pay back" its overrunning. > But, doesn't this get us quite far from what one would expect. I mean, > enforcement granularity will be way different from task period, no? Yes, the granularity will be what the kernel can provide (due to the HZ value and to the hrtick on/off state). But at least the task will not starve non-deadline tasks (which is the bug that originated this discussion, I think). 
If I understand well, there are two different (and orthogonal) issues here: 1) Due to a bug in the accounting / enforcement mechanisms (the wrong placement of the while() loop), the task consumes 100% of the CPU time, starving non-deadline tasks 2) Due to the large HZ value, the small runtime (and period) and the fact that hrtick is disabled, the kernel cannot provide the requested scheduling granularity The second issue can be fixed by imposing limits on minimum and maximum runtime and the first issue can be fixed by changing the code as I suggested in my previous email. I would suggest to address both issues, with separate changes (the current replenishment code looks strange anyway). Luca
Re: INFO: rcu detected stall in do_idle
On Thu, 18 Oct 2018 12:47:13 +0200 Juri Lelli wrote: > Hi, > > On 18/10/18 12:23, luca abeni wrote: > > Hi Juri, > > > > On Thu, 18 Oct 2018 10:28:38 +0200 > > Juri Lelli wrote: > > [...] > > > struct sched_attr { > > > .size = 0, > > > .policy = 6, > > > .flags= 0, > > > .nice = 0, > > > .priority = 0, > > > .runtime = 0x9917, > > > .deadline = 0x, > > > .period = 0, > > > } > > > > > > So, we seem to be correctly (in theory, see below) accepting the > > > task. > > > > > > What seems to generate the problem here is that CONFIG_HZ=100 and > > > reproducer task has "tiny" runtime (~40us) and deadline (~66us) > > > parameters, combination that "bypasses" the enforcing mechanism > > > (performed at each tick). > > > > Ok, so the task can execute for at most 1 tick before being > > throttled... Which does not look too bad. > > > > I missed the original emails, but maybe the issue is that the task > > blocks before the tick, and when it wakes up again something goes > > wrong with the deadline and runtime assignment? (maybe because the > > deadline is in the past?) > > No, the problem is that the task won't be throttled at all, because > its replenishing instant is always way in the past when tick > occurs. :-/ Ok, I see the issue now: the problem is that the "while (dl_se->runtime <= 0)" loop is executed at replenishment time, but the deadline should be postponed at enforcement time. I mean: in update_curr_dl() we do: dl_se->runtime -= scaled_delta_exec; if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { ... enqueue replenishment timer at dl_next_period(dl_se) But dl_next_period() is based on a "wrong" deadline! I think that inserting a while (dl_se->runtime <= -pi_se->dl_runtime) { dl_se->deadline += pi_se->dl_period; dl_se->runtime += pi_se->dl_runtime; } immediately after "dl_se->runtime -= scaled_delta_exec;" would fix the problem, no? 
If we go this way, then we can remove the while loop from replenish_dl_entity(), and change it in WARN_ON(dl_se->runtime <= -pi_se->dl_runtime); WARN_ON(dl_se->runtime > 0); dl_se->deadline += pi_se->dl_period; dl_se->runtime += pi_se->dl_runtime; or something similar. Luca
Re: INFO: rcu detected stall in do_idle
On Thu, 18 Oct 2018 12:47:13 +0200 Juri Lelli wrote: > Hi, > > On 18/10/18 12:23, luca abeni wrote: > > Hi Juri, > > > > On Thu, 18 Oct 2018 10:28:38 +0200 > > Juri Lelli wrote: > > [...] > > > struct sched_attr { > > > .size = 0, > > > .policy = 6, > > > .flags= 0, > > > .nice = 0, > > > .priority = 0, > > > .runtime = 0x9917, > > > .deadline = 0x, > > > .period = 0, > > > } > > > > > > So, we seem to be correctly (in theory, see below) accepting the > > > task. > > > > > > What seems to generate the problem here is that CONFIG_HZ=100 and > > > reproducer task has "tiny" runtime (~40us) and deadline (~66us) > > > parameters, combination that "bypasses" the enforcing mechanism > > > (performed at each tick). > > > > Ok, so the task can execute for at most 1 tick before being > > throttled... Which does not look too bad. > > > > I missed the original emails, but maybe the issue is that the task > > blocks before the tick, and when it wakes up again something goes > > wrong with the deadline and runtime assignment? (maybe because the > > deadline is in the past?) > > No, the problem is that the task won't be throttled at all, because > its replenishing instant is always way in the past when tick > occurs. :-/ Ok, I see the issue now: the problem is that the "while (dl_se->runtime <= 0)" loop is executed at replenishment time, but the deadline should be postponed at enforcement time. I mean: in update_curr_dl() we do: dl_se->runtime -= scaled_delta_exec; if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { ... enqueue replenishment timer at dl_next_period(dl_se) But dl_next_period() is based on a "wrong" deadline! I think that inserting a while (dl_se->runtime <= -pi_se->dl_runtime) { dl_se->deadline += pi_se->dl_period; dl_se->runtime += pi_se->dl_runtime; } immediately after "dl_se->runtime -= scaled_delta_exec;" would fix the problem, no? 
If we go this way, then we can remove the while loop from replenish_dl_entity(), and change it into WARN_ON(dl_se->runtime <= -pi_se->dl_runtime); WARN_ON(dl_se->runtime > 0); dl_se->deadline += pi_se->dl_period; dl_se->runtime += pi_se->dl_runtime; or something similar. Luca
Re: INFO: rcu detected stall in do_idle
Hi Juri, On Thu, 18 Oct 2018 12:10:08 +0200 Juri Lelli wrote: [...] > > Yes, a HZ related limit sounds like something we'd want. But if > > we're going to do a minimum sysctl, we should also consider adding > > a maximum, if you set a massive period/deadline, you can, even with > > a relatively low u, incur significant delays. > > > > And do we want to put the limit on runtime or on period ? > > > > That is, something like: > > > > TICK_NSEC/2 < period < 10*TICK_NSEC > > > > and/or > > > > TICK_NSEC/2 < runtime < 10*TICK_NSEC > > > > Hmm, for HZ=1000 that ends up with a max period of 10ms, that's far > > too low, 24Hz needs ~41ms. We can of course also limit the runtime > > by capping u for users (as we should anyway). > > I also thought of TICK_NSEC/2 as a reasonably safe lower limit I tend to think that something larger than "2" should be used (maybe 10? I mean: even if HZ = 100, it might make sense to allow a runtime equal to 1ms...) > that will implicitly limit period as well since > >runtime <= deadline <= period I agree that if we end up with TICK_NSEC/2 for the runtime limit then explicitly enforcing a minimum period is not needed. > Not sure about the upper limit, though. Lower limit is something > related to the inherent granularity of the platform/config, upper > limit is more to do with highest prio stuff with huge period delaying > everything else; doesn't seem to be related to HZ? I agree Luca
Re: INFO: rcu detected stall in do_idle
Hi Juri, On Thu, 18 Oct 2018 12:10:08 +0200 Juri Lelli wrote: [...] > > Yes, a HZ related limit sounds like something we'd want. But if > > we're going to do a minimum sysctl, we should also consider adding > > a maximum, if you set a massive period/deadline, you can, even with > > a relatively low u, incur significant delays. > > > > And do we want to put the limit on runtime or on period ? > > > > That is, something like: > > > > TICK_NSEC/2 < period < 10*TICK_NSEC > > > > and/or > > > > TICK_NSEC/2 < runtime < 10*TICK_NSEC > > > > Hmm, for HZ=1000 that ends up with a max period of 10ms, that's far > > too low, 24Hz needs ~41ms. We can of course also limit the runtime > > by capping u for users (as we should anyway). > > I also thought of TICK_NSEC/2 as a reasonably safe lower limit I tend to think that something larger than "2" should be used (maybe 10? I mean: even if HZ = 100, it might make sense to allow a runtime equal to 1ms...) > that will implicitly limit period as well since > >runtime <= deadline <= period I agree that if we end up with TICK_NSEC/2 for the runtime limit then explicitly enforcing a minimum period is not needed. > Not sure about the upper limit, though. Lower limit is something > related to the inherent granularity of the platform/config, upper > limit is more to do with highest prio stuff with huge period delaying > everything else; doesn't seem to be related to HZ? I agree Luca
Re: INFO: rcu detected stall in do_idle
Hi Peter, On Thu, 18 Oct 2018 11:48:50 +0200 Peter Zijlstra wrote: [...] > > So, I tend to think that we might want to play safe and put some > > higher minimum value for dl_runtime (it's currently at 1ULL << > > DL_SCALE). Guess the problem is to pick a reasonable value, though. > > Maybe link it someway to HZ? Then we might add a sysctl (or > > similar) thing with which knowledgeable users can do whatever they > > think their platform/config can support? > > Yes, a HZ related limit sounds like something we'd want. But if we're > going to do a minimum sysctl, we should also consider adding a > maximum, if you set a massive period/deadline, you can, even with a > relatively low u, incur significant delays. I agree with this. > And do we want to put the limit on runtime or on period ? I think we should have a minimum allowed runtime, a maximum allowed runtime, a minimum allowed period and a (per-user? per-control group?) maximum allowed utilization. I suspect having a maximum period is useless, if we already enforce a maximum runtime. > That is, something like: > > TICK_NSEC/2 < period < 10*TICK_NSEC As written above I would not enforce a maximum period. > > and/or > > TICK_NSEC/2 < runtime < 10*TICK_NSEC I think (but I might be wrong) that "TICK_NSEC/2" is too large... I would divide the tick by a larger number (how many times do we want to allow the loop to run?) And I think the maximum runtime should not be TICK-dependent... It is the maximum amount of time for which we allow the deadline task to starve non-deadline tasks, so it should be an absolute time, not something HZ-dependent... No? > Hmm, for HZ=1000 that ends up with a max period of 10ms, that's far > too low, 24Hz needs ~41ms. We can of course also limit the runtime by > capping u for users (as we should anyway). Regarding capping u for users: some time ago, with Juri we discussed the idea of having per-cgroup limits on the deadline utilization... 
I think this is a good idea (and if the userspace creates a cgroup per user, this results in per-user capping - but it is more flexible in general) Luca
Re: INFO: rcu detected stall in do_idle
Hi Peter, On Thu, 18 Oct 2018 11:48:50 +0200 Peter Zijlstra wrote: [...] > > So, I tend to think that we might want to play safe and put some > > higher minimum value for dl_runtime (it's currently at 1ULL << > > DL_SCALE). Guess the problem is to pick a reasonable value, though. > > Maybe link it someway to HZ? Then we might add a sysctl (or > > similar) thing with which knowledgeable users can do whatever they > > think their platform/config can support? > > Yes, a HZ related limit sounds like something we'd want. But if we're > going to do a minimum sysctl, we should also consider adding a > maximum, if you set a massive period/deadline, you can, even with a > relatively low u, incur significant delays. I agree with this. > And do we want to put the limit on runtime or on period ? I think we should have a minimum allowed runtime, a maximum allowed runtime, a minimum allowed period and a (per-user? per-control group?) maximum allowed utilization. I suspect having a maximum period is useless, if we already enforce a maximum runtime. > That is, something like: > > TICK_NSEC/2 < period < 10*TICK_NSEC As written above I would not enforce a maximum period. > > and/or > > TICK_NSEC/2 < runtime < 10*TICK_NSEC I think (but I might be wrong) that "TICK_NSEC/2" is too large... I would divide the tick by a larger number (how many times do we want to allow the loop to run?) And I think the maximum runtime should not be TICK-dependent... It is the maximum amount of time for which we allow the deadline task to starve non-deadline tasks, so it should be an absolute time, not something HZ-dependent... No? > Hmm, for HZ=1000 that ends up with a max period of 10ms, that's far > too low, 24Hz needs ~41ms. We can of course also limit the runtime by > capping u for users (as we should anyway). Regarding capping u for users: some time ago, with Juri we discussed the idea of having per-cgroup limits on the deadline utilization... 
I think this is a good idea (and if the userspace creates a cgroup per user, this results in per-user capping - but it is more flexible in general) Luca
Re: INFO: rcu detected stall in do_idle
Hi Juri, On Thu, 18 Oct 2018 10:28:38 +0200 Juri Lelli wrote: [...] > struct sched_attr { > .size = 0, > .policy = 6, > .flags= 0, > .nice = 0, > .priority = 0, > .runtime = 0x9917, > .deadline = 0x, > .period = 0, > } > > So, we seem to be correctly (in theory, see below) accepting the task. > > What seems to generate the problem here is that CONFIG_HZ=100 and > reproducer task has "tiny" runtime (~40us) and deadline (~66us) > parameters, combination that "bypasses" the enforcing mechanism > (performed at each tick). Ok, so the task can execute for at most 1 tick before being throttled... Which does not look too bad. I missed the original emails, but maybe the issue is that the task blocks before the tick, and when it wakes up again something goes wrong with the deadline and runtime assignment? (maybe because the deadline is in the past?) > Another side problem seems also to be that with such tiny parameters > we spend lot of time in the while (dl_se->runtime <= 0) loop of > replenish_dl_ entity() (actually uselessly, as deadline is most > probably going to still be in the past when eventually runtime > becomes positive again), as delta_exec is huge w.r.t. runtime and > runtime has to keep up with tiny increments of dl_runtime. I guess we > could ameliorate things here by limiting the number of time we > execute the loop before bailing out. Actually, I think the loop will iterate at most 10ms / 39us times, which is about 256 times, right? If this is too much (I do not know how much time it is spent executing the loop), then the solution is (as you suggest) to increase the minimum allowed runtime. [...] > So, I tend to think that we might want to play safe and put some > higher minimum value for dl_runtime (it's currently at 1ULL << > DL_SCALE). Guess the problem is to pick a reasonable value, though. > Maybe link it someway to HZ? Yes, a value dependent on HZ looks like a good idea. 
I would propose HZ / N, where N is the maximum number of times you want the loop above to be executed. > Then we might add a sysctl (or similar) > thing with which knowledgeable users can do whatever they think their > platform/config can support? I guess this can be related to the utilization limits we were discussing some time ago... I would propose a cgroup-based interface to set all of these limits. Luca
Re: INFO: rcu detected stall in do_idle
Hi Juri, On Thu, 18 Oct 2018 10:28:38 +0200 Juri Lelli wrote: [...] > struct sched_attr { > .size = 0, > .policy = 6, > .flags= 0, > .nice = 0, > .priority = 0, > .runtime = 0x9917, > .deadline = 0x, > .period = 0, > } > > So, we seem to be correctly (in theory, see below) accepting the task. > > What seems to generate the problem here is that CONFIG_HZ=100 and > reproducer task has "tiny" runtime (~40us) and deadline (~66us) > parameters, combination that "bypasses" the enforcing mechanism > (performed at each tick). Ok, so the task can execute for at most 1 tick before being throttled... Which does not look too bad. I missed the original emails, but maybe the issue is that the task blocks before the tick, and when it wakes up again something goes wrong with the deadline and runtime assignment? (maybe because the deadline is in the past?) > Another side problem seems also to be that with such tiny parameters > we spend lot of time in the while (dl_se->runtime <= 0) loop of > replenish_dl_ entity() (actually uselessly, as deadline is most > probably going to still be in the past when eventually runtime > becomes positive again), as delta_exec is huge w.r.t. runtime and > runtime has to keep up with tiny increments of dl_runtime. I guess we > could ameliorate things here by limiting the number of time we > execute the loop before bailing out. Actually, I think the loop will iterate at most 10ms / 39us times, which is about 256 times, right? If this is too much (I do not know how much time it is spent executing the loop), then the solution is (as you suggest) to increase the minimum allowed runtime. [...] > So, I tend to think that we might want to play safe and put some > higher minimum value for dl_runtime (it's currently at 1ULL << > DL_SCALE). Guess the problem is to pick a reasonable value, though. > Maybe link it someway to HZ? Yes, a value dependent on HZ looks like a good idea. 
I would propose HZ / N, where N is the maximum number of times you want the loop above to be executed. > Then we might add a sysctl (or similar) > thing with which knowledgeable users can do whatever they think their > platform/config can support? I guess this can be related to the utilization limits we were discussing some time ago... I would propose a cgroup-based interface to set all of these limits. Luca
Re: [RFD/RFC PATCH 5/8] sched: Add proxy execution
On Thu, 11 Oct 2018 14:53:25 +0200 Peter Zijlstra wrote: [...] > > > > + if (rq->curr != rq->idle) { > > > > + rq->proxy = rq->idle; > > > > + set_tsk_need_resched(rq->idle); > > > > + /* > > > > +* XXX [juril] don't we still need to migrate > > > > @next to > > > > +* @owner's CPU? > > > > +*/ > > > > + return rq->idle; > > > > + } > > > > > > If I understand well, this code ends up migrating the task only > > > if the CPU was previously idle? (scheduling the idle task if the > > > CPU was not previously idle) > > > > > > Out of curiosity (I admit this is my ignorance), why is this > > > needed? If I understand well, after scheduling the idle task the > > > scheduler will be invoked again (because of the > > > set_tsk_need_resched(rq->idle)) but I do not understand why it is > > > not possible to migrate task "p" immediately (I would just check > > > "rq->curr != p", to avoid migrating the currently scheduled > > > task). [...] > I think it was the safe and simple choice; note that we're not > migrating just a single @p, but a whole chain of @p. Ah, that's the point I was missing... Thanks for explaining, now everything looks more clear! But... Here is my next dumb question: once the tasks are migrated to the other runqueue, what prevents the scheduler from migrating them back? In particular, task p: if it is (for example) a fixed priority task and is on this runqueue, it is probably because the FP invariant wants this... So, the push mechanism might end up migrating p back to this runqueue soon... No? Another doubt: if I understand well, when a task p "blocks" on a mutex the proxy mechanism migrates it (and the whole chain of blocked tasks) to the owner's core... Right? Now, I understand why this is simpler to implement, but from the schedulability point of view shouldn't we migrate the owner to p's core instead? Thanks, Luca > rq->curr must > not be any of the possible @p's. rq->idle, is per definition not one > of the @p's. > > Does that make sense?
Re: [RFD/RFC PATCH 5/8] sched: Add proxy execution
On Thu, 11 Oct 2018 14:53:25 +0200 Peter Zijlstra wrote: [...] > > > > + if (rq->curr != rq->idle) { > > > > + rq->proxy = rq->idle; > > > > + set_tsk_need_resched(rq->idle); > > > > + /* > > > > +* XXX [juril] don't we still need to migrate > > > > @next to > > > > +* @owner's CPU? > > > > +*/ > > > > + return rq->idle; > > > > + } > > > > > > If I understand well, this code ends up migrating the task only > > > if the CPU was previously idle? (scheduling the idle task if the > > > CPU was not previously idle) > > > > > > Out of curiosity (I admit this is my ignorance), why is this > > > needed? If I understand well, after scheduling the idle task the > > > scheduler will be invoked again (because of the > > > set_tsk_need_resched(rq->idle)) but I do not understand why it is > > > not possible to migrate task "p" immediately (I would just check > > > "rq->curr != p", to avoid migrating the currently scheduled > > > task). [...] > I think it was the safe and simple choice; note that we're not > migrating just a single @p, but a whole chain of @p. Ah, that's the point I was missing... Thanks for explaining, now everything looks more clear! But... Here is my next dumb question: once the tasks are migrated to the other runqueue, what prevents the scheduler from migrating them back? In particular, task p: if it is (for example) a fixed priority task and is on this runqueue, it is probably because the FP invariant wants this... So, the push mechanism might end up migrating p back to this runqueue soon... No? Another doubt: if I understand well, when a task p "blocks" on a mutex the proxy mechanism migrates it (and the whole chain of blocked tasks) to the owner's core... Right? Now, I understand why this is simpler to implement, but from the schedulability point of view shouldn't we migrate the owner to p's core instead? Thanks, Luca > rq->curr must > not be any of the possible @p's. rq->idle, is per definition not one > of the @p's. > > Does that make sense?
Re: [RFD/RFC PATCH 0/8] Towards implementing proxy execution
Hi all, On Tue, 9 Oct 2018 11:24:26 +0200 Juri Lelli wrote: > Hi all, > > Proxy Execution (also goes under several other names) isn't a new > concept, it has been mentioned already in the past to this community > (both in email discussions and at conferences [1, 2]), but no actual > implementation that applies to a fairly recent kernel exists as of > today (of which I'm aware of at least - happy to be proven wrong). > > Very broadly speaking, more info below, proxy execution enables a task > to run using the context of some other task that is "willing" to > participate in the mechanism, as this helps both tasks to improve > performance (w.r.t. the latter task not participating to proxy > execution). First of all, thanks to Juri for working on this! I am looking at the patchset, and I have some questions / comments (maybe some of my questions are really stupid, I do not know... :) To begin, I have a very high-level comment about proxy execution: I believe proxy execution is a very powerful concept, that can be used in many cases, not only to do inheritance on mutual exclusion (think, for example, about pipelines of tasks: a task implementing a stage of the pipeline can act as a proxy for the task implementing the previous stage). So, I would propose to make the proxy() function of patch more generic, and not strictly bound to mutexes. Maybe a task structure can contain a list of tasks for which the task can act as a proxy, and we can have a function like "I want to act as a proxy for task T" to be invoked when a task blocks? Luca
Re: [RFD/RFC PATCH 0/8] Towards implementing proxy execution
Hi all, On Tue, 9 Oct 2018 11:24:26 +0200 Juri Lelli wrote: > Hi all, > > Proxy Execution (also goes under several other names) isn't a new > concept, it has been mentioned already in the past to this community > (both in email discussions and at conferences [1, 2]), but no actual > implementation that applies to a fairly recent kernel exists as of > today (of which I'm aware of at least - happy to be proven wrong). > > Very broadly speaking, more info below, proxy execution enables a task > to run using the context of some other task that is "willing" to > participate in the mechanism, as this helps both tasks to improve > performance (w.r.t. the latter task not participating to proxy > execution). First of all, thanks to Juri for working on this! I am looking at the patchset, and I have some questions / comments (maybe some of my questions are really stupid, I do not know... :) To begin, I have a very high-level comment about proxy execution: I believe proxy execution is a very powerful concept, that can be used in many cases, not only to do inheritance on mutual exclusion (think, for example, about pipelines of tasks: a task implementing a stage of the pipeline can act as a proxy for the task implementing the previous stage). So, I would propose to make the proxy() function of patch more generic, and not strictly bound to mutexes. Maybe a task structure can contain a list of tasks for which the task can act as a proxy, and we can have a function like "I want to act as a proxy for task T" to be invoked when a task blocks? Luca
Re: [RFD/RFC PATCH 0/8] Towards implementing proxy execution
On Wed, 10 Oct 2018 12:57:10 +0200 Peter Zijlstra wrote: > On Wed, Oct 10, 2018 at 12:34:17PM +0200, luca abeni wrote: > > So, I would propose to make the proxy() function of patch more > > generic, and not strictly bound to mutexes. Maybe a task structure > > can contain a list of tasks for which the task can act as a proxy, > > and we can have a function like "I want to act as a proxy for task > > T" to be invoked when a task blocks? > > Certainly possible, but that's something I'd prefer to look at after > it all 'works'. Of course :) I was mentioning this idea because maybe it can have some impact on the design. BTW, here is another "interesting" issue I had in the past with changes like this one: how do we check if the patchset works as expected? "No crashes" is surely a requirement, but I think we also need some kind of testcase that fails if the inheritance mechanism is not working properly, and is successful if the inheritance works. Maybe we can develop some testcase based on rt-app (if noone has such a testcase already) Thanks, Luca > The mutex blocking thing doesn't require external > interfaces etc..
Re: [RFD/RFC PATCH 0/8] Towards implementing proxy execution
On Wed, 10 Oct 2018 12:57:10 +0200 Peter Zijlstra wrote: > On Wed, Oct 10, 2018 at 12:34:17PM +0200, luca abeni wrote: > > So, I would propose to make the proxy() function of patch more > > generic, and not strictly bound to mutexes. Maybe a task structure > > can contain a list of tasks for which the task can act as a proxy, > > and we can have a function like "I want to act as a proxy for task > > T" to be invoked when a task blocks? > > Certainly possible, but that's something I'd prefer to look at after > it all 'works'. Of course :) I was mentioning this idea because maybe it can have some impact on the design. BTW, here is another "interesting" issue I had in the past with changes like this one: how do we check if the patchset works as expected? "No crashes" is surely a requirement, but I think we also need some kind of testcase that fails if the inheritance mechanism is not working properly, and is successful if the inheritance works. Maybe we can develop some testcase based on rt-app (if noone has such a testcase already) Thanks, Luca > The mutex blocking thing doesn't require external > interfaces etc..
Re: [RFD/RFC PATCH 5/8] sched: Add proxy execution
Hi, On Tue, 9 Oct 2018 11:24:31 +0200 Juri Lelli wrote: [...] > +migrate_task: [...] > + put_prev_task(rq, next); > + if (rq->curr != rq->idle) { > + rq->proxy = rq->idle; > + set_tsk_need_resched(rq->idle); > + /* > + * XXX [juril] don't we still need to migrate @next > to > + * @owner's CPU? > + */ > + return rq->idle; > + } If I understand well, this code ends up migrating the task only if the CPU was previously idle? (scheduling the idle task if the CPU was not previously idle) Out of curiosity (I admit this is my ignorance), why is this needed? If I understand well, after scheduling the idle task the scheduler will be invoked again (because of the set_tsk_need_resched(rq->idle)) but I do not understand why it is not possible to migrate task "p" immediately (I would just check "rq->curr != p", to avoid migrating the currently scheduled task). Thanks, Luca > + rq->proxy = _task; > + > + for (; p; p = p->blocked_task) { > + int wake_cpu = p->wake_cpu; > + > + WARN_ON(p == rq->curr); > + > + p->on_rq = TASK_ON_RQ_MIGRATING; > + dequeue_task(rq, p, 0); > + set_task_cpu(p, that_cpu); > + /* > + * We can abuse blocked_entry to migrate the thing, > because @p is > + * still on the rq. > + */ > + list_add(>blocked_entry, _list); > + > + /* > + * Preserve p->wake_cpu, such that we can tell where > it > + * used to run later. > + */ > + p->wake_cpu = wake_cpu; > + } > + > + rq_unpin_lock(rq, rf); > + raw_spin_unlock(>lock); > + raw_spin_lock(_rq->lock); > + > + while (!list_empty(_list)) { > + p = list_first_entry(_list, struct > task_struct, blocked_entry); > + list_del_init(>blocked_entry); > + > + enqueue_task(that_rq, p, 0); > + check_preempt_curr(that_rq, p, 0); > + p->on_rq = TASK_ON_RQ_QUEUED; > + resched_curr(that_rq); > + } > + > + raw_spin_unlock(_rq->lock); > + raw_spin_lock(>lock); > + rq_repin_lock(rq, rf); > + > + return NULL; /* Retry task selection on _this_ CPU. 
*/ > + > +owned_task: > + /* > + * Its possible we interleave with mutex_unlock like: > + * > + * lock(>lock); > + *proxy() > + * mutex_unlock() > + * lock(_lock); > + * next(owner) = current->blocked_task; > + * unlock(_lock); > + * > + * wake_up_q(); > + * ... > + * ttwu_remote() > + * __task_rq_lock() > + *lock(_lock); > + *owner == p > + * > + * Which leaves us to finish the ttwu_remote() and make it > go. > + * > + * XXX is this happening in case of an HANDOFF to p? > + * In any case, reading of the owner in > __mutex_unlock_slowpath is > + * done atomically outside wait_lock (only adding waiters to > wake_q is > + * done inside the critical section). > + * Does this means we can get to proxy _w/o an owner_ if > that was > + * cleared before grabbing wait_lock? Do we account for this > case? > + * OK we actually do (see PROXY_EXEC ifdeffery in unlock > function). > + */ > + > + /* > + * Finish wakeup, will make the contending ttwu do a > + * _spurious_ wakeup, but all code should be able to > + * deal with that. > + */ > + owner->blocked_on = NULL; > + owner->state = TASK_RUNNING; > + // XXX task_woken > + > + /* > + * If @owner/@p is allowed to run on this CPU, make it go. > + */ > + if (cpumask_test_cpu(this_cpu, >cpus_allowed)) { > + raw_spin_unlock(>wait_lock); > + return owner; > + } > + > + /* > + * We have to let ttwu fix things up, because we > + * can't restore the affinity. So dequeue. > + */ > + owner->on_rq = 0; > + deactivate_task(rq, p, DEQUEUE_SLEEP); > + goto blocked_task; > + > +blocked_task: > + /* > + * If !@owner->on_rq, holding @rq->lock will not pin the > task, > + * so we cannot drop @mutex->wait_lock until we're sure its > a blocked > + * task on this rq. > + * > + * We use @owner->blocked_lock to serialize against > ttwu_activate(). > + * Either we see its new owner->on_rq or it will see our > list_add(). > + */ > + raw_spin_lock(>blocked_lock); > + > + /* > + * If we became runnable while waiting for blocked_lock, > retry. 
> + */ > + if (owner->on_rq) { > + /* > + * If we see the new on->rq, we must also see the > new task_cpu(). > + */ > +
Re: [RFD/RFC PATCH 5/8] sched: Add proxy execution
Hi, On Tue, 9 Oct 2018 11:24:31 +0200 Juri Lelli wrote: [...] > +migrate_task: [...] > + put_prev_task(rq, next); > + if (rq->curr != rq->idle) { > + rq->proxy = rq->idle; > + set_tsk_need_resched(rq->idle); > + /* > + * XXX [juril] don't we still need to migrate @next > to > + * @owner's CPU? > + */ > + return rq->idle; > + } If I understand well, this code ends up migrating the task only if the CPU was previously idle? (scheduling the idle task if the CPU was not previously idle) Out of curiosity (I admit this is my ignorance), why is this needed? If I understand well, after scheduling the idle task the scheduler will be invoked again (because of the set_tsk_need_resched(rq->idle)) but I do not understand why it is not possible to migrate task "p" immediately (I would just check "rq->curr != p", to avoid migrating the currently scheduled task). Thanks, Luca > + rq->proxy = _task; > + > + for (; p; p = p->blocked_task) { > + int wake_cpu = p->wake_cpu; > + > + WARN_ON(p == rq->curr); > + > + p->on_rq = TASK_ON_RQ_MIGRATING; > + dequeue_task(rq, p, 0); > + set_task_cpu(p, that_cpu); > + /* > + * We can abuse blocked_entry to migrate the thing, > because @p is > + * still on the rq. > + */ > + list_add(>blocked_entry, _list); > + > + /* > + * Preserve p->wake_cpu, such that we can tell where > it > + * used to run later. > + */ > + p->wake_cpu = wake_cpu; > + } > + > + rq_unpin_lock(rq, rf); > + raw_spin_unlock(>lock); > + raw_spin_lock(_rq->lock); > + > + while (!list_empty(_list)) { > + p = list_first_entry(_list, struct > task_struct, blocked_entry); > + list_del_init(>blocked_entry); > + > + enqueue_task(that_rq, p, 0); > + check_preempt_curr(that_rq, p, 0); > + p->on_rq = TASK_ON_RQ_QUEUED; > + resched_curr(that_rq); > + } > + > + raw_spin_unlock(_rq->lock); > + raw_spin_lock(>lock); > + rq_repin_lock(rq, rf); > + > + return NULL; /* Retry task selection on _this_ CPU. 
*/ > + > +owned_task: > + /* > + * Its possible we interleave with mutex_unlock like: > + * > + * lock(>lock); > + *proxy() > + * mutex_unlock() > + * lock(_lock); > + * next(owner) = current->blocked_task; > + * unlock(_lock); > + * > + * wake_up_q(); > + * ... > + * ttwu_remote() > + * __task_rq_lock() > + *lock(_lock); > + *owner == p > + * > + * Which leaves us to finish the ttwu_remote() and make it > go. > + * > + * XXX is this happening in case of an HANDOFF to p? > + * In any case, reading of the owner in > __mutex_unlock_slowpath is > + * done atomically outside wait_lock (only adding waiters to > wake_q is > + * done inside the critical section). > + * Does this means we can get to proxy _w/o an owner_ if > that was > + * cleared before grabbing wait_lock? Do we account for this > case? > + * OK we actually do (see PROXY_EXEC ifdeffery in unlock > function). > + */ > + > + /* > + * Finish wakeup, will make the contending ttwu do a > + * _spurious_ wakeup, but all code should be able to > + * deal with that. > + */ > + owner->blocked_on = NULL; > + owner->state = TASK_RUNNING; > + // XXX task_woken > + > + /* > + * If @owner/@p is allowed to run on this CPU, make it go. > + */ > + if (cpumask_test_cpu(this_cpu, >cpus_allowed)) { > + raw_spin_unlock(>wait_lock); > + return owner; > + } > + > + /* > + * We have to let ttwu fix things up, because we > + * can't restore the affinity. So dequeue. > + */ > + owner->on_rq = 0; > + deactivate_task(rq, p, DEQUEUE_SLEEP); > + goto blocked_task; > + > +blocked_task: > + /* > + * If !@owner->on_rq, holding @rq->lock will not pin the > task, > + * so we cannot drop @mutex->wait_lock until we're sure its > a blocked > + * task on this rq. > + * > + * We use @owner->blocked_lock to serialize against > ttwu_activate(). > + * Either we see its new owner->on_rq or it will see our > list_add(). > + */ > + raw_spin_lock(>blocked_lock); > + > + /* > + * If we became runnable while waiting for blocked_lock, > retry. 
> + */ > + if (owner->on_rq) { > + /* > + * If we see the new on->rq, we must also see the > new task_cpu(). > + */ > +
Re: [RFD/RFC PATCH 3/8] locking/mutex: Rework task_struct::blocked_on
Hi, On Tue, 9 Oct 2018 11:24:29 +0200 Juri Lelli wrote: > From: Peter Zijlstra > > Track the blocked-on relation for mutexes, this allows following this > relation at schedule time. Add blocked_task to track the inverse > relation. > > ,-> task > | | blocked-on > | v >blocked-task | mutex > | | owner > | v > `-- task I was a little bit confused by this description, because (if I understand the code well) blocked_task does not actually track the inverse of the "blocked_on" relationship, but just points to the task that is _currently_ acting as a proxy for a given task. In theory, we could have multiple tasks blocked on "mutex" (which is owned by "task"), so if "blocked_task" tracked the inverse of "blocked_on" it should have been a list (or a data structure containing pointers to multiple task structures), no? I would propose to change "blocked_task" into something like "current_proxy", or similar, which should be more clear (unless I completely misunderstood this stuff... In that case, sorry about the noise) Also, I suspect that this "blocked_task" (or "current_proxy") field should be introduced in patch 5 (same for the "task_is_blocked()" function from patch 4... Should it go in patch 5?) Luca > > This patch only enables blocked-on relation, blocked-task will be > enabled in a later patch implementing proxy(). 
> > Signed-off-by: Peter Zijlstra (Intel) > [minor changes while rebasing] > Signed-off-by: Juri Lelli > --- > include/linux/sched.h| 6 ++ > kernel/fork.c| 6 +++--- > kernel/locking/mutex-debug.c | 7 +++ > kernel/locking/mutex.c | 3 +++ > 4 files changed, 11 insertions(+), 11 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 977cb57d7bc9..a35e8ab3eef1 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -907,10 +907,8 @@ struct task_struct { > struct rt_mutex_waiter *pi_blocked_on; > #endif > > -#ifdef CONFIG_DEBUG_MUTEXES > - /* Mutex deadlock detection: */ > - struct mutex_waiter *blocked_on; > -#endif > + struct task_struct *blocked_task; /* task > that's boosting us */ > + struct mutex*blocked_on;/* lock > we're blocked on */ > #ifdef CONFIG_TRACE_IRQFLAGS > unsigned intirq_events; > diff --git a/kernel/fork.c b/kernel/fork.c > index f0b58479534f..ef27a675b0d7 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1827,9 +1827,9 @@ static __latent_entropy struct task_struct > *copy_process( lockdep_init_task(p); > #endif > > -#ifdef CONFIG_DEBUG_MUTEXES > - p->blocked_on = NULL; /* not blocked yet */ > -#endif > + p->blocked_task = NULL; /* nobody is boosting us yet*/ > + p->blocked_on = NULL; /* not blocked yet */ > + > #ifdef CONFIG_BCACHE > p->sequential_io= 0; > p->sequential_io_avg= 0; > diff --git a/kernel/locking/mutex-debug.c > b/kernel/locking/mutex-debug.c index a660d38b6c29..6605e083a3e9 100644 > --- a/kernel/locking/mutex-debug.c > +++ b/kernel/locking/mutex-debug.c > @@ -53,8 +53,8 @@ void debug_mutex_add_waiter(struct mutex *lock, > struct mutex_waiter *waiter, { > SMP_DEBUG_LOCKS_WARN_ON(!raw_spin_is_locked(>wait_lock)); > > - /* Mark the current thread as blocked on the lock: */ > - task->blocked_on = waiter; > + /* Current thread can't be alredy blocked (since it's > executing!) 
*/ > + DEBUG_LOCKS_WARN_ON(task->blocked_on); > } > > void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter > *waiter, @@ -62,8 +62,7 @@ void mutex_remove_waiter(struct mutex > *lock, struct mutex_waiter *waiter, { > DEBUG_LOCKS_WARN_ON(list_empty(>list)); > DEBUG_LOCKS_WARN_ON(waiter->task != task); > - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); > - task->blocked_on = NULL; > + DEBUG_LOCKS_WARN_ON(task->blocked_on != lock); > > list_del_init(>list); > waiter->task = NULL; > diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c > index f37402cd8496..76b59b555da3 100644 > --- a/kernel/locking/mutex.c > +++ b/kernel/locking/mutex.c > @@ -979,6 +979,7 @@ __mutex_lock_common(struct mutex *lock, long > state, unsigned int subclass, } > > waiter.task = current; > + current->blocked_on = lock; > > set_current_state(state); > for (;;) { > @@ -1047,6 +1048,8 @@ __mutex_lock_common(struct mutex *lock, long > state, unsigned int subclass, } > > mutex_remove_waiter(lock, , current); > + current->blocked_on = NULL; > + > if (likely(list_empty(>wait_list))) > __mutex_clear_flag(lock, MUTEX_FLAGS); >
Re: [RFD/RFC PATCH 3/8] locking/mutex: Rework task_struct::blocked_on
Hi, On Tue, 9 Oct 2018 11:24:29 +0200 Juri Lelli wrote: > From: Peter Zijlstra > > Track the blocked-on relation for mutexes, this allows following this > relation at schedule time. Add blocked_task to track the inverse > relation. > > ,-> task > | | blocked-on > | v >blocked-task | mutex > | | owner > | v > `-- task I was a little bit confused by this description, because (if I understand the code well) blocked_task does not actually track the inverse of the "blocked_on" relationship, but just points to the task that is _currently_ acting as a proxy for a given task. In theory, we could have multiple tasks blocked on "mutex" (which is owned by "task"), so if "blocked_task" tracked the inverse of "blocked_on" it should have been a list (or a data structure containing pointers to multiple task structures), no? I would propose to change "blocked_task" into something like "current_proxy", or similar, which should be more clear (unless I completely misunderstood this stuff... In that case, sorry about the noise) Also, I suspect that this "blocked_task" (or "current_proxy") field should be introduced in patch 5 (same for the "task_is_blocked()" function from patch 4... Should it go in patch 5?) Luca > > This patch only enables blocked-on relation, blocked-task will be > enabled in a later patch implementing proxy(). 
> > Signed-off-by: Peter Zijlstra (Intel) > [minor changes while rebasing] > Signed-off-by: Juri Lelli > --- > include/linux/sched.h| 6 ++ > kernel/fork.c| 6 +++--- > kernel/locking/mutex-debug.c | 7 +++ > kernel/locking/mutex.c | 3 +++ > 4 files changed, 11 insertions(+), 11 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 977cb57d7bc9..a35e8ab3eef1 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -907,10 +907,8 @@ struct task_struct { > struct rt_mutex_waiter *pi_blocked_on; > #endif > > -#ifdef CONFIG_DEBUG_MUTEXES > - /* Mutex deadlock detection: */ > - struct mutex_waiter *blocked_on; > -#endif > + struct task_struct *blocked_task; /* task > that's boosting us */ > + struct mutex*blocked_on;/* lock > we're blocked on */ > #ifdef CONFIG_TRACE_IRQFLAGS > unsigned intirq_events; > diff --git a/kernel/fork.c b/kernel/fork.c > index f0b58479534f..ef27a675b0d7 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1827,9 +1827,9 @@ static __latent_entropy struct task_struct > *copy_process( lockdep_init_task(p); > #endif > > -#ifdef CONFIG_DEBUG_MUTEXES > - p->blocked_on = NULL; /* not blocked yet */ > -#endif > + p->blocked_task = NULL; /* nobody is boosting us yet*/ > + p->blocked_on = NULL; /* not blocked yet */ > + > #ifdef CONFIG_BCACHE > p->sequential_io= 0; > p->sequential_io_avg= 0; > diff --git a/kernel/locking/mutex-debug.c > b/kernel/locking/mutex-debug.c index a660d38b6c29..6605e083a3e9 100644 > --- a/kernel/locking/mutex-debug.c > +++ b/kernel/locking/mutex-debug.c > @@ -53,8 +53,8 @@ void debug_mutex_add_waiter(struct mutex *lock, > struct mutex_waiter *waiter, { > SMP_DEBUG_LOCKS_WARN_ON(!raw_spin_is_locked(>wait_lock)); > > - /* Mark the current thread as blocked on the lock: */ > - task->blocked_on = waiter; > + /* Current thread can't be alredy blocked (since it's > executing!) 
*/ > + DEBUG_LOCKS_WARN_ON(task->blocked_on); > } > > void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter > *waiter, @@ -62,8 +62,7 @@ void mutex_remove_waiter(struct mutex > *lock, struct mutex_waiter *waiter, { > DEBUG_LOCKS_WARN_ON(list_empty(>list)); > DEBUG_LOCKS_WARN_ON(waiter->task != task); > - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); > - task->blocked_on = NULL; > + DEBUG_LOCKS_WARN_ON(task->blocked_on != lock); > > list_del_init(>list); > waiter->task = NULL; > diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c > index f37402cd8496..76b59b555da3 100644 > --- a/kernel/locking/mutex.c > +++ b/kernel/locking/mutex.c > @@ -979,6 +979,7 @@ __mutex_lock_common(struct mutex *lock, long > state, unsigned int subclass, } > > waiter.task = current; > + current->blocked_on = lock; > > set_current_state(state); > for (;;) { > @@ -1047,6 +1048,8 @@ __mutex_lock_common(struct mutex *lock, long > state, unsigned int subclass, } > > mutex_remove_waiter(lock, , current); > + current->blocked_on = NULL; > + > if (likely(list_empty(>wait_list))) > __mutex_clear_flag(lock, MUTEX_FLAGS); >
Re: [PATCH] sched/deadline: Fix switched_from_dl
On Wed, 11 Jul 2018 09:29:48 +0200 Juri Lelli wrote: > Mark noticed that syzkaller is able to reliably trigger the following > > dl_rq->running_bw > dl_rq->this_bw > WARNING: CPU: 1 PID: 153 at kernel/sched/deadline.c:124 > switched_from_dl+0x454/0x608 Kernel panic - not syncing: > panic_on_warn set ... > > CPU: 1 PID: 153 Comm: syz-executor253 Not tainted 4.18.0-rc3+ #29 > Hardware name: linux,dummy-virt (DT) > Call trace: >dump_backtrace+0x0/0x458 >show_stack+0x20/0x30 >dump_stack+0x180/0x250 >panic+0x2dc/0x4ec >__warn_printk+0x0/0x150 >report_bug+0x228/0x2d8 >bug_handler+0xa0/0x1a0 >brk_handler+0x2f0/0x568 >do_debug_exception+0x1bc/0x5d0 >el1_dbg+0x18/0x78 >switched_from_dl+0x454/0x608 >__sched_setscheduler+0x8cc/0x2018 >sys_sched_setattr+0x340/0x758 >el0_svc_naked+0x30/0x34 > > syzkaller reproducer runs a bunch of threads that constantly switch > between DEADLINE and NORMAL classes while interacting through futexes. > > The splat above is caused by the fact that if a DEADLINE task is > setattr back to NORMAL while in non_contending state (blocked on a > futex - inactive timer armed), its contribution to running_bw is not > removed before sub_rq_bw() gets called (!task_on_rq_queued() branch) > and the latter sees running_bw > this_bw. > > Fix it by removing a task contribution from running_bw if the task is > not queued and in non_contending state while switched to a different > class. > > Reported-by: Mark Rutland > Signed-off-by: Juri Lelli > --- > kernel/sched/deadline.c | 11 ++- > 1 file changed, 10 insertions(+), 1 deletion(-) Reviewed-by: Luca Abeni And, thanks for taking care of this issue! 
Thanks, Luca > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index fbfc3f1d368a..10c7b51c0d1f 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq, > struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) > task_non_contending(p); > > - if (!task_on_rq_queued(p)) > + if (!task_on_rq_queued(p)) { > + /* > + * Inactive timer is armed. However, p is leaving > DEADLINE and > + * might migrate away from this rq while continuing > to run on > + * some other class. We need to remove its > contribution from > + * this rq running_bw now, or sub_rq_bw (below) will > complain. > + */ > + if (p->dl.dl_non_contending) > + sub_running_bw(>dl, >dl); > sub_rq_bw(>dl, >dl); > + } > > /* >* We cannot use inactive_task_timer() to invoke > sub_running_bw()
Re: [PATCH] sched/deadline: Fix switched_from_dl
On Wed, 11 Jul 2018 09:29:48 +0200 Juri Lelli wrote: > Mark noticed that syzkaller is able to reliably trigger the following > > dl_rq->running_bw > dl_rq->this_bw > WARNING: CPU: 1 PID: 153 at kernel/sched/deadline.c:124 > switched_from_dl+0x454/0x608 Kernel panic - not syncing: > panic_on_warn set ... > > CPU: 1 PID: 153 Comm: syz-executor253 Not tainted 4.18.0-rc3+ #29 > Hardware name: linux,dummy-virt (DT) > Call trace: >dump_backtrace+0x0/0x458 >show_stack+0x20/0x30 >dump_stack+0x180/0x250 >panic+0x2dc/0x4ec >__warn_printk+0x0/0x150 >report_bug+0x228/0x2d8 >bug_handler+0xa0/0x1a0 >brk_handler+0x2f0/0x568 >do_debug_exception+0x1bc/0x5d0 >el1_dbg+0x18/0x78 >switched_from_dl+0x454/0x608 >__sched_setscheduler+0x8cc/0x2018 >sys_sched_setattr+0x340/0x758 >el0_svc_naked+0x30/0x34 > > syzkaller reproducer runs a bunch of threads that constantly switch > between DEADLINE and NORMAL classes while interacting through futexes. > > The splat above is caused by the fact that if a DEADLINE task is > setattr back to NORMAL while in non_contending state (blocked on a > futex - inactive timer armed), its contribution to running_bw is not > removed before sub_rq_bw() gets called (!task_on_rq_queued() branch) > and the latter sees running_bw > this_bw. > > Fix it by removing a task contribution from running_bw if the task is > not queued and in non_contending state while switched to a different > class. > > Reported-by: Mark Rutland > Signed-off-by: Juri Lelli > --- > kernel/sched/deadline.c | 11 ++- > 1 file changed, 10 insertions(+), 1 deletion(-) Reviewed-by: Luca Abeni And, thanks for taking care of this issue! 
Thanks, Luca > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index fbfc3f1d368a..10c7b51c0d1f 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq, > struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) > task_non_contending(p); > > - if (!task_on_rq_queued(p)) > + if (!task_on_rq_queued(p)) { > + /* > + * Inactive timer is armed. However, p is leaving > DEADLINE and > + * might migrate away from this rq while continuing > to run on > + * some other class. We need to remove its > contribution from > + * this rq running_bw now, or sub_rq_bw (below) will > complain. > + */ > + if (p->dl.dl_non_contending) > + sub_running_bw(>dl, >dl); > sub_rq_bw(>dl, >dl); > + } > > /* >* We cannot use inactive_task_timer() to invoke > sub_running_bw()
Re: [PATCH v5 00/10] track CPU utilization
Hi all, sorry; I missed the beginning of this thread... Anyway, below I add some comments: On Wed, 6 Jun 2018 15:05:58 +0200 Claudio Scordino wrote: [...] > >> Ok, I see ... Have you guys already tried something like my patch > >> above (keeping the freq >= this_bw) in real world use cases ? Is > >> this costing that much energy in practice ? If we fill the gaps > >> left by DL (when it > > > > IIRC, Claudio (now Cc-ed) did experiment a bit with both > > approaches, so he might add some numbers to my words above. I > > didn't (yet). But, please consider that I might be reserving (for > > example) 50% of bandwidth for my heavy and time sensitive task and > > then have that task wake up only once in a while (but I'll be > > keeping clock speed up for the whole time). :/ > > As far as I can remember, we never tested energy consumption of > running_bw vs this_bw, as at OSPM'17 we had already decided to use > running_bw implementing GRUB-PA. The rationale is that, as Juri > pointed out, the amount of spare (i.e. reclaimable) bandwidth in > this_bw is very user-dependent. Yes, I agree with this. The appropriateness of using this_bw or running_bw is highly workload-dependent... If a periodic task consumes much less than its runtime (or if a sporadic task has inter-activation times much larger than the SCHED_DEADLINE period), then running_bw has to be preferred. But if a periodic task consumes almost all of its runtime before blocking, then this_bw has to be preferred... But this also depends on the hardware: if the frequency switch time is small, then running_bw is more appropriate... On the other hand, this_bw works much better if the frequency switch time is high. (Talking about this, I remember Claudio measured frequency switch times of almost 3ms... Is this really due to hardware issues? Or maybe there is some software issue involved? 3ms looks like a lot of time...) Anyway, this is why I proposed to use some kind of /sys/... 
file to control the kind of deadline utilization used for frequency scaling: in this way, the system designer / administrator, who hopefully has the needed information about workload and hardware, can optimize the frequency scaling behaviour by deciding if running_bw or this_bw will be used. Luca > For example, the user can let this_bw > be much higher than the measured bandwidth, just to be sure that the > deadlines are met even in corner cases. In practice, this means that > the task executes for quite a short time and then blocks (with its > bandwidth reclaimed, hence the CPU frequency reduced, at the 0lag > time). Using this_bw rather than running_bw, the CPU frequency would > remain at the same fixed value even when the task is blocked. I > understand that on some cases it could even be better (i.e. no waste > of energy in frequency switch). However, IMHO, these are corner cases > and in the average case it is better to rely on running_bw and reduce > the CPU frequency accordingly. > > Best regards, > > Claudio
Re: [PATCH v5 00/10] track CPU utilization
Hi all, sorry; I missed the beginning of this thread... Anyway, below I add some comments: On Wed, 6 Jun 2018 15:05:58 +0200 Claudio Scordino wrote: [...] > >> Ok, I see ... Have you guys already tried something like my patch > >> above (keeping the freq >= this_bw) in real world use cases ? Is > >> this costing that much energy in practice ? If we fill the gaps > >> left by DL (when it > > > > IIRC, Claudio (now Cc-ed) did experiment a bit with both > > approaches, so he might add some numbers to my words above. I > > didn't (yet). But, please consider that I might be reserving (for > > example) 50% of bandwidth for my heavy and time sensitive task and > > then have that task wake up only once in a while (but I'll be > > keeping clock speed up for the whole time). :/ > > As far as I can remember, we never tested energy consumption of > running_bw vs this_bw, as at OSPM'17 we had already decided to use > running_bw implementing GRUB-PA. The rationale is that, as Juri > pointed out, the amount of spare (i.e. reclaimable) bandwidth in > this_bw is very user-dependent. Yes, I agree with this. The appropriateness of using this_bw or running_bw is highly workload-dependent... If a periodic task consumes much less than its runtime (or if a sporadic task has inter-activation times much larger than the SCHED_DEADLINE period), then running_bw has to be preferred. But if a periodic task consumes almost all of its runtime before blocking, then this_bw has to be preferred... But this also depends on the hardware: if the frequency switch time is small, then running_bw is more appropriate... On the other hand, this_bw works much better if the frequency switch time is high. (Talking about this, I remember Claudio measured frequency switch times of almost 3ms... Is this really due to hardware issues? Or maybe there is some software issue involved? 3ms looks like a lot of time...) Anyway, this is why I proposed to use some kind of /sys/... 
file to control the kind of deadline utilization used for frequency scaling: in this way, the system designer / administrator, who hopefully has the needed information about workload and hardware, can optimize the frequency scaling behaviour by deciding if running_bw or this_bw will be used. Luca > For example, the user can let this_bw > be much higher than the measured bandwidth, just to be sure that the > deadlines are met even in corner cases. In practice, this means that > the task executes for quite a short time and then blocks (with its > bandwidth reclaimed, hence the CPU frequency reduced, at the 0lag > time). Using this_bw rather than running_bw, the CPU frequency would > remain at the same fixed value even when the task is blocked. I > understand that on some cases it could even be better (i.e. no waste > of energy in frequency switch). However, IMHO, these are corner cases > and in the average case it is better to rely on running_bw and reduce > the CPU frequency accordingly. > > Best regards, > > Claudio
Re: [PATCH v5 00/10] track CPU utilization
Hi, On Wed, 6 Jun 2018 14:20:46 +0100 Quentin Perret wrote: [...] > > However, IMHO, these are corner cases and in the average case it is > > better to rely on running_bw and reduce the CPU frequency > > accordingly. > > My point was that accepting to go at a lower frequency than required > by this_bw is fundamentally unsafe. If you're at a low frequency when > a DL task starts, there are real situations where you won't be able > to increase the frequency immediately, which can eventually lead to > missing deadlines. Now, if this risk is known, has been discussed, > and is accepted, that's fair enough. I'm just too late for the > discussion :-) Well, our conclusion was that this issue can be addressed when designing the scheduling parameters: - If we do not consider frequency scaling, a task can respect its deadlines if the SCHED_DEADLINE runtime is larger than the task's execution time and the SCHED_DEADLINE period is smaller than the task's period (and if some kind of "global" admission test is respected) - Considering frequency scaling (and 0-time frequency switches), the SCHED_DEADLINE runtime must be larger than the task execution time at the highest frequency - If the frequency switch time is larger than 0, then the SCHED_DEADLINE runtime must be larger than the task execution time (at the highest frequency) plus the frequency switch time If this third condition is respected, I think that deadline misses can be avoided even if running_bw is used (and the CPU takes a considerable time to switch frequency). Of course, this requires an over-allocation of runtime (and the global admission test has more probabilities to fail)... Luca
Re: [PATCH v5 00/10] track CPU utilization
Hi, On Wed, 6 Jun 2018 14:20:46 +0100 Quentin Perret wrote: [...] > > However, IMHO, these are corner cases and in the average case it is > > better to rely on running_bw and reduce the CPU frequency > > accordingly. > > My point was that accepting to go at a lower frequency than required > by this_bw is fundamentally unsafe. If you're at a low frequency when > a DL task starts, there are real situations where you won't be able > to increase the frequency immediately, which can eventually lead to > missing deadlines. Now, if this risk is known, has been discussed, > and is accepted, that's fair enough. I'm just too late for the > discussion :-) Well, our conclusion was that this issue can be addressed when designing the scheduling parameters: - If we do not consider frequency scaling, a task can respect its deadlines if the SCHED_DEADLINE runtime is larger than the task's execution time and the SCHED_DEADLINE period is smaller than the task's period (and if some kind of "global" admission test is respected) - Considering frequency scaling (and 0-time frequency switches), the SCHED_DEADLINE runtime must be larger than the task execution time at the highest frequency - If the frequency switch time is larger than 0, then the SCHED_DEADLINE runtime must be larger than the task execution time (at the highest frequency) plus the frequency switch time If this third condition is respected, I think that deadline misses can be avoided even if running_bw is used (and the CPU takes a considerable time to switch frequency). Of course, this requires an over-allocation of runtime (and the global admission test has more probabilities to fail)... Luca
Re: [PATCH V2 0/7] sched/deadline: fix cpusets bandwidth accounting
Hi Mathieu, On Thu, 1 Feb 2018 09:51:02 -0700 Mathieu Poirier wrote: > This is the follow-up patchset to [1] that attempt to fix a problem > reported by Steve Rostedt [2] where DL bandwidth accounting is not > recomputed after CPUset and CPU hotplug operations. When CPU hotplug and > some CUPset manipulation take place root domains are destroyed and new ones > created, loosing at the same time DL accounting information pertaining to > utilisation. Please see [1] for a full description of the approach. I do not know the cgroup / cpuset code too much, so I have no useful comments on your patches... But I think this patchset is a nice improvement with respect to the current situation. [...] > A notable addition is patch 7/7 - it addresses a problem seen when hot > plugging out a CPU where a DL task is running (see changelog for full > details). The issue is unrelated to this patchset and will manifest > itself on a mainline kernel. I think I introduced this bug with my reclaiming patches, so I am interested. When a cpu is hot-plugged out, which code in the kernel is responsible for migrating the tasks that are executing on such CPU? I was sure I was handling all the relevant codepaths, but this bug clearly shows that I was wrong. Thanks, Luca
Re: [PATCH V2 0/7] sched/deadline: fix cpusets bandwidth accounting
Hi Mathieu, On Thu, 1 Feb 2018 09:51:02 -0700 Mathieu Poirier wrote: > This is the follow-up patchset to [1] that attempt to fix a problem > reported by Steve Rostedt [2] where DL bandwidth accounting is not > recomputed after CPUset and CPU hotplug operations. When CPU hotplug and > some CUPset manipulation take place root domains are destroyed and new ones > created, loosing at the same time DL accounting information pertaining to > utilisation. Please see [1] for a full description of the approach. I do not know the cgroup / cpuset code too much, so I have no useful comments on your patches... But I think this patchset is a nice improvement with respect to the current situation. [...] > A notable addition is patch 7/7 - it addresses a problem seen when hot > plugging out a CPU where a DL task is running (see changelog for full > details). The issue is unrelated to this patchset and will manifest > itself on a mainline kernel. I think I introduced this bug with my reclaiming patches, so I am interested. When a cpu is hot-plugged out, which code in the kernel is responsible for migrating the tasks that are executing on such CPU? I was sure I was handling all the relevant codepaths, but this bug clearly shows that I was wrong. Thanks, Luca
Re: Sparse warnings from sched.h
On Sat, 25 Nov 2017 21:46:11 -0800 Jakub Kicinski wrote: > Hi! > > Did these: > > ./include/linux/sched.h:476:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:477:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:478:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:479:62: error: dubious one-bit signed bitfield > > got fixed? I saw there were patches posted, but nothing have reached > Linus's tree, yet. I think a fix is in the tip tree: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=aa5222e92f8000ed3c1c38dddf11c83222aadfb3 Luca
Re: Sparse warnings from sched.h
On Sat, 25 Nov 2017 21:46:11 -0800 Jakub Kicinski wrote: > Hi! > > Did these: > > ./include/linux/sched.h:476:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:477:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:478:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:479:62: error: dubious one-bit signed bitfield > > got fixed? I saw there were patches posted, but nothing have reached > Linus's tree, yet. I think a fix is in the tip tree: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=aa5222e92f8000ed3c1c38dddf11c83222aadfb3 Luca
Re: [PATCH] sched/deadline: Use bools for the state flags
Hello, On Tue, 21 Nov 2017 11:44:05 +0100 (CET) Jiri Kosina <ji...@kernel.org> wrote: > From: Jiri Kosina <jkos...@suse.cz> > > Commit > > 799ba82de01e ("sched/deadline: Use C bitfields for the state flags") > > converted state flags into one-bit signed int. Signed one-bit type can be > either 0 or -1, which is going to cause a problem once 1 is assigned to it > and then the value later tested against 1. I think a different fix has just been committed to tip: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=aa5222e92f8000ed3c1c38dddf11c83222aadfb3 Luca > > The current code is okay, as all the checks are (non-)zero check, but I > believe that we'd rather be safe than sorry here; remove the fragility by > converting the state flags to bool. > > This also silences annoying sparse complaints about this very issue when > compiling any code that includes sched.h. > > Fixes: 799ba82de01e ("sched/deadline: Use C bitfields for the state flags") > Cc: luca abeni <luca.ab...@santannapisa.it> > Cc: Peter Zijlstra (Intel) <pet...@infradead.org> > Cc: Daniel Bristot de Oliveira <bris...@redhat.com> > Cc: Juri Lelli <juri.le...@arm.com> > Cc: Linus Torvalds <torva...@linux-foundation.org> > Cc: Mathieu Poirier <mathieu.poir...@linaro.org> > Cc: Mike Galbraith <efa...@gmx.de> > Cc: Peter Zijlstra <pet...@infradead.org> > Cc: Steven Rostedt <rost...@goodmis.org> > Cc: Thomas Gleixner <t...@linutronix.de> > Signed-off-by: Jiri Kosina <jkos...@suse.cz> > --- > include/linux/sched.h | 8 > kernel/sched/core.c | 8 > kernel/sched/deadline.c | 32 > 3 files changed, 24 insertions(+), 24 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index a5dc7c98b0a2..b19fa1b96726 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -473,10 +473,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the wakeup >* code. 
>*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + booldl_throttled; > + booldl_boosted; > + booldl_yielded; > + booldl_non_contending; > > /* >* Bandwidth enforcement timer. Each -deadline task has its > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 75554f366fd3..f9bbf0f55e0e 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -3738,20 +3738,20 @@ void rt_mutex_setprio(struct task_struct *p, struct > task_struct *pi_task) > if (dl_prio(prio)) { > if (!dl_prio(p->normal_prio) || > (pi_task && dl_entity_preempt(_task->dl, >dl))) { > - p->dl.dl_boosted = 1; > + p->dl.dl_boosted = true; > queue_flag |= ENQUEUE_REPLENISH; > } else > - p->dl.dl_boosted = 0; > + p->dl.dl_boosted = false; > p->sched_class = _sched_class; > } else if (rt_prio(prio)) { > if (dl_prio(oldprio)) > - p->dl.dl_boosted = 0; > + p->dl.dl_boosted = false; > if (oldprio < prio) > queue_flag |= ENQUEUE_HEAD; > p->sched_class = _sched_class; > } else { > if (dl_prio(oldprio)) > - p->dl.dl_boosted = 0; > + p->dl.dl_boosted = false; > if (rt_prio(oldprio)) > p->rt.timeout = 0; > p->sched_class = _sched_class; > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index 2473736c7616..78cb00ae8b2f 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -133,7 +133,7 @@ void dl_change_utilization(struct task_struct *p, u64 > new_bw) > rq = task_rq(p); > if (p->dl.dl_non_contending) { > sub_running_bw(p->dl.dl_bw, >dl); > - p->dl.dl_non_contending = 0; > + p->dl.dl_non_contending = false; > /* >* If the timer handler is currently running and the >* timer cannot be cancelled, i
Re: [PATCH] sched/deadline: Use bools for the state flags
Hello, On Tue, 21 Nov 2017 11:44:05 +0100 (CET) Jiri Kosina wrote: > From: Jiri Kosina > > Commit > > 799ba82de01e ("sched/deadline: Use C bitfields for the state flags") > > converted state flags into one-bit signed int. Signed one-bit type can be > either 0 or -1, which is going to cause a problem once 1 is assigned to it > and then the value later tested against 1. I think a different fix has just been committed to tip: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=aa5222e92f8000ed3c1c38dddf11c83222aadfb3 Luca > > The current code is okay, as all the checks are (non-)zero check, but I > believe that we'd rather be safe than sorry here; remove the fragility by > converting the state flags to bool. > > This also silences annoying sparse complaints about this very issue when > compiling any code that includes sched.h. > > Fixes: 799ba82de01e ("sched/deadline: Use C bitfields for the state flags") > Cc: luca abeni > Cc: Peter Zijlstra (Intel) > Cc: Daniel Bristot de Oliveira > Cc: Juri Lelli > Cc: Linus Torvalds > Cc: Mathieu Poirier > Cc: Mike Galbraith > Cc: Peter Zijlstra > Cc: Steven Rostedt > Cc: Thomas Gleixner > Signed-off-by: Jiri Kosina > --- > include/linux/sched.h | 8 > kernel/sched/core.c | 8 > kernel/sched/deadline.c | 32 > 3 files changed, 24 insertions(+), 24 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index a5dc7c98b0a2..b19fa1b96726 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -473,10 +473,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the wakeup >* code. >*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + booldl_throttled; > + booldl_boosted; > + booldl_yielded; > + booldl_non_contending; > > /* >* Bandwidth enforcement timer. 
Each -deadline task has its > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 75554f366fd3..f9bbf0f55e0e 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -3738,20 +3738,20 @@ void rt_mutex_setprio(struct task_struct *p, struct > task_struct *pi_task) > if (dl_prio(prio)) { > if (!dl_prio(p->normal_prio) || > (pi_task && dl_entity_preempt(_task->dl, >dl))) { > - p->dl.dl_boosted = 1; > + p->dl.dl_boosted = true; > queue_flag |= ENQUEUE_REPLENISH; > } else > - p->dl.dl_boosted = 0; > + p->dl.dl_boosted = false; > p->sched_class = _sched_class; > } else if (rt_prio(prio)) { > if (dl_prio(oldprio)) > - p->dl.dl_boosted = 0; > + p->dl.dl_boosted = false; > if (oldprio < prio) > queue_flag |= ENQUEUE_HEAD; > p->sched_class = _sched_class; > } else { > if (dl_prio(oldprio)) > - p->dl.dl_boosted = 0; > + p->dl.dl_boosted = false; > if (rt_prio(oldprio)) > p->rt.timeout = 0; > p->sched_class = _sched_class; > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c > index 2473736c7616..78cb00ae8b2f 100644 > --- a/kernel/sched/deadline.c > +++ b/kernel/sched/deadline.c > @@ -133,7 +133,7 @@ void dl_change_utilization(struct task_struct *p, u64 > new_bw) > rq = task_rq(p); > if (p->dl.dl_non_contending) { > sub_running_bw(p->dl.dl_bw, >dl); > - p->dl.dl_non_contending = 0; > + p->dl.dl_non_contending = false; > /* >* If the timer handler is currently running and the >* timer cannot be cancelled, inactive_task_timer() > @@ -251,7 +251,7 @@ static void task_non_contending(struct task_struct *p) > return; > } > > - dl_se->dl_non_contending = 1; > + dl_se->dl_non_contending = true; > get_task_struct(p); > hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); > } > @@ -271,7 +27
Re: [PATCH] sched: use unsigned int for one-bit bitfield in sched_dl_entity
Hi, On Fri, 17 Nov 2017 14:50:11 +0800 Xin Longwrote: > This patch is to fix the 'dubious one-bit signed bitfield' error reported > by sparse, when using 'make C=2'. > > Fixes: 799ba82de01e ("sched/deadline: Use C bitfields for the state flags") > Signed-off-by: Xin Long I think this is very similar to patches already sent by Dan Carpenter and Matthew Wilcox. As for the previous patches, I think the change is ok. Luca > --- > include/linux/sched.h | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index a5dc7c9..3e35a37 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -473,10 +473,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the wakeup >* code. >*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + unsigned intdl_throttled : 1, > + dl_boosted: 1, > + dl_yielded: 1, > + dl_non_contending : 1; > > /* >* Bandwidth enforcement timer. Each -deadline task has its
Re: [PATCH] sched: use unsigned int for one-bit bitfield in sched_dl_entity
Hi, On Fri, 17 Nov 2017 14:50:11 +0800 Xin Long wrote: > This patch is to fix the 'dubious one-bit signed bitfield' error reported > by sparse, when using 'make C=2'. > > Fixes: 799ba82de01e ("sched/deadline: Use C bitfields for the state flags") > Signed-off-by: Xin Long I think this is very similar to patches already sent by Dan Carpenter and Matthew Wilcox. As for the previous patches, I think the change is ok. Luca > --- > include/linux/sched.h | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index a5dc7c9..3e35a37 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -473,10 +473,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the wakeup >* code. >*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + unsigned intdl_throttled : 1, > + dl_boosted: 1, > + dl_yielded: 1, > + dl_non_contending : 1; > > /* >* Bandwidth enforcement timer. Each -deadline task has its
Re: New sparse warnings from sched.h
Hi, On Tue, 14 Nov 2017 12:41:35 -0800 Matthew Wilcox <wi...@infradead.org> wrote: > commit 799ba82de01e7543f6b2042e1a739f3a20255f23 > Author: luca abeni <luca.ab...@santannapisa.it> > Date: Thu Sep 7 12:09:31 2017 +0200 > > sched/deadline: Use C bitfields for the state flags > > Ask the compiler to use a single bit for storing true / false > values, instead of wasting the size of a whole int value. > Tested with gcc 5.4.0 on x86_64, and the compiler produces the > expected Assembly (similar to the Assembly code generated when > explicitly accessing the bits with bitmasks, "&" and "|"). > > produces four warnings from sparse for every file which includes > sched.h: > > ./include/linux/sched.h:476:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:477:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:478:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:479:62: error: dubious one-bit signed bitfield > > This seems like the trivial fix (untested): [...] I think Dan Carpenter sent a similar patch (and I replied agreeing with it). I believe in this particular case signed integers are safe, but if "unsigned int" is the preferred style in the kernel I agree with the change. Thanks, Luca > > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index a5dc7c98b0a2..21991d668d35 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -473,10 +473,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the > wakeup >* code. >*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + unsigned intdl_throttled : 1; > + unsigned intdl_boosted: 1; > + unsigned intdl_yielded: 1; > + unsigned intdl_non_contending : 1; > > /* >* Bandwidth enforcement timer. Each -deadline task has its
Re: New sparse warnings from sched.h
Hi, On Tue, 14 Nov 2017 12:41:35 -0800 Matthew Wilcox wrote: > commit 799ba82de01e7543f6b2042e1a739f3a20255f23 > Author: luca abeni > Date: Thu Sep 7 12:09:31 2017 +0200 > > sched/deadline: Use C bitfields for the state flags > > Ask the compiler to use a single bit for storing true / false > values, instead of wasting the size of a whole int value. > Tested with gcc 5.4.0 on x86_64, and the compiler produces the > expected Assembly (similar to the Assembly code generated when > explicitly accessing the bits with bitmasks, "&" and "|"). > > produces four warnings from sparse for every file which includes > sched.h: > > ./include/linux/sched.h:476:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:477:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:478:62: error: dubious one-bit signed bitfield > ./include/linux/sched.h:479:62: error: dubious one-bit signed bitfield > > This seems like the trivial fix (untested): [...] I think Dan Carpenter sent a similar patch (and I replied agreeing with it). I believe in this particular case signed integers are safe, but if "unsigned int" is the preferred style in the kernel I agree with the change. Thanks, Luca > > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index a5dc7c98b0a2..21991d668d35 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -473,10 +473,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the > wakeup >* code. >*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + unsigned intdl_throttled : 1; > + unsigned intdl_boosted: 1; > + unsigned intdl_yielded: 1; > + unsigned intdl_non_contending : 1; > > /* >* Bandwidth enforcement timer. Each -deadline task has its
Re: [PATCH] sched/deadline: Don't use dubious signed bitfields
Hi, On Fri, 13 Oct 2017 10:01:22 +0300 Dan Carpenter <dan.carpen...@oracle.com> wrote: > It doesn't cause a run-time bug, but these bitfields should be unsigned. > When it's signed ->dl_throttled is set to either 0 or -1, instead of > 0 and 1 as expected. The sched.h file is included into tons of places > so Sparse generates a flood of warnings like this: > > ./include/linux/sched.h:477:54: error: dubious one-bit signed bitfield > > Signed-off-by: Dan Carpenter <dan.carpen...@oracle.com> I did not notice any issue when testing, but if "unsigned int" is the common practice for bitfields, I agree with the change. Reviewed-by: Luca Abeni <luca.ab...@santannapisa.it> Thanks, Luca > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 0f897dfc195e..105eaff8a5e7 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -474,10 +474,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the wakeup >* code. >*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + unsigned intdl_throttled : 1; > + unsigned intdl_boosted: 1; > + unsigned intdl_yielded: 1; > + unsigned intdl_non_contending : 1; > > /* >* Bandwidth enforcement timer. Each -deadline task has its
Re: [PATCH] sched/deadline: Don't use dubious signed bitfields
Hi, On Fri, 13 Oct 2017 10:01:22 +0300 Dan Carpenter wrote: > It doesn't cause a run-time bug, but these bitfields should be unsigned. > When it's signed ->dl_throttled is set to either 0 or -1, instead of > 0 and 1 as expected. The sched.h file is included into tons of places > so Sparse generates a flood of warnings like this: > > ./include/linux/sched.h:477:54: error: dubious one-bit signed bitfield > > Signed-off-by: Dan Carpenter I did not notice any issue when testing, but if "unsigned int" is the common practice for bitfields, I agree with the change. Reviewed-by: Luca Abeni Thanks, Luca > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 0f897dfc195e..105eaff8a5e7 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -474,10 +474,10 @@ struct sched_dl_entity { >* conditions between the inactive timer handler and the wakeup >* code. >*/ > - int dl_throttled : 1; > - int dl_boosted: 1; > - int dl_yielded: 1; > - int dl_non_contending : 1; > + unsigned intdl_throttled : 1; > + unsigned intdl_boosted: 1; > + unsigned intdl_yielded: 1; > + unsigned intdl_non_contending : 1; > > /* >* Bandwidth enforcement timer. Each -deadline task has its
Re: [PATCH 0/7] sched/deadline: fix cpusets bandwidth accounting
Hi Mathieu, On Thu, 12 Oct 2017 10:57:09 -0600 Mathieu Poirier wrote: [...] > >> Regardless of how we proceed (using existing CPUset list or new ones) we > >> need to deal with DL tasks that span more than one root domain, something > >> that will typically happen after a CPUset operation. For example, if we > >> split the number of available CPUs on a system in two CPUsets and then turn > >> off the 'sched_load_balance' flag on the parent CPUset, DL tasks in the > >> parent CPUset will end up spanning two root domains. > >> > >> One way to deal with this is to prevent CPUset operations from happening > >> when such condition is detected, as enacted in this set. Although simple > >> this approach feels brittle and akin to a "whack-a-mole" game. A better > >> and more reliable approach would be to teach the DL scheduler to deal with > >> tasks that span multiple root domains, a serious and substantial > >> undertaking. > >> > >> I am sending this as a starting point for discussion. I would be grateful > >> if you could take the time to comment on the approach and most importantly > >> provide input on how to deal with the open issue underlined above. > > > > Right, so teaching DEADLINE about arbitrary affinities is 'interesting'. > > > > Although the rules proposed by Tomasso; if found sufficient; would > > greatly simplify things. Also the online semi-partition approach to SMP > > could help with that. > > The "rules" proposed by Tomasso, are you referring to patches or the > deadline/cgroup extension work that he presented at OSPM? No, that is an unrelated thing... Tommaso previously proposed some improvements to the admission control mechanism to take arbitrary affinities into account. 
I think Tommaso's proposal is similar to what I previously proposed in this thread (to admit a SCHED_DEADLINE task with utilization u = runtime / period and affinity to N runqueues, we can account u / N to each one of the runqueues, and check if the sum of the utilizations on each runqueue is < 1). As previously noticed by Peter, this might have some scalability issues (a naive implementation would lock the root domain while iterating on all the runqueues). Few days ago, I was discussing with Tommaso about a possible solution based on not locking the root domain structure, and eventually using a roll-back strategy if the status of the root domain changes while we are updating it. I think in a previous email you mentioned RCU, which might result in a similar solution. Anyway, I am adding Tommaso in cc so that he can comment more. > I'd also be > interested to know more about this "online semi-partition approach to > SMP" you mentioned. It is basically an implementation (and extension to arbitrary affinities) of this work: http://drops.dagstuhl.de/opus/volltexte/2017/7165/ Luca > Maybe that's a conversation we could have at the > upcoming RT summit in Prague. > > > > > But yes, that's fairly massive surgery. For now I think we'll have to > > live and accept the limitations. So failing the various cpuset > > operations when they violate rules seems fine. Relaxing rules is always > > easier than tightening them (later). > > Agreed. > > > > > One 'series' you might be interested in when respinning these is: > > > > > > https://lkml.kernel.org/r/20171011094833.pdp4torvotvjd...@hirez.programming.kicks-ass.net > > > > By doing synchronous domain rebuild we loose a bunch of funnies. > > Getting rid of the asynchronous nature of the hotplug path would be a > delight - I'll start keeping track of that effort as well. > > Thanks for the review, > Mathieu
Re: [PATCH 0/7] sched/deadline: fix cpusets bandwidth accounting
Hi Mathieu, On Thu, 12 Oct 2017 10:57:09 -0600 Mathieu Poirier wrote: [...] > >> Regardless of how we proceed (using existing CPUset list or new ones) we > >> need to deal with DL tasks that span more than one root domain, something > >> that will typically happen after a CPUset operation. For example, if we > >> split the number of available CPUs on a system in two CPUsets and then turn > >> off the 'sched_load_balance' flag on the parent CPUset, DL tasks in the > >> parent CPUset will end up spanning two root domains. > >> > >> One way to deal with this is to prevent CPUset operations from happening > >> when such condition is detected, as enacted in this set. Although simple > >> this approach feels brittle and akin to a "whack-a-mole" game. A better > >> and more reliable approach would be to teach the DL scheduler to deal with > >> tasks that span multiple root domains, a serious and substantial > >> undertaking. > >> > >> I am sending this as a starting point for discussion. I would be grateful > >> if you could take the time to comment on the approach and most importantly > >> provide input on how to deal with the open issue underlined above. > > > > Right, so teaching DEADLINE about arbitrary affinities is 'interesting'. > > > > Although the rules proposed by Tomasso; if found sufficient; would > > greatly simplify things. Also the online semi-partition approach to SMP > > could help with that. > > The "rules" proposed by Tomasso, are you referring to patches or the > deadline/cgroup extension work that he presented at OSPM? No, that is an unrelated thing... Tommaso previously proposed some improvements to the admission control mechanism to take arbitrary affinities into account. 
I think Tommaso's proposal is similar to what I previously proposed in this thread (to admit a SCHED_DEADLINE task with utilization u = runtime / period and affinity to N runqueues, we can account u / N to each one of the runqueues, and check if the sum of the utilizations on each runqueue is < 1). As previously noticed by Peter, this might have some scalability issues (a naive implementation would lock the root domain while iterating on all the runqueues). Few days ago, I was discussing with Tommaso about a possible solution based on not locking the root domain structure, and eventually using a roll-back strategy if the status of the root domain changes while we are updating it. I think in a previous email you mentioned RCU, which might result in a similar solution. Anyway, I am adding Tommaso in cc so that he can comment more. > I'd also be > interested to know more about this "online semi-partition approach to > SMP" you mentioned. It is basically an implementation (and extension to arbitrary affinities) of this work: http://drops.dagstuhl.de/opus/volltexte/2017/7165/ Luca > Maybe that's a conversation we could have at the > upcoming RT summit in Prague. > > > > > But yes, that's fairly massive surgery. For now I think we'll have to > > live and accept the limitations. So failing the various cpuset > > operations when they violate rules seems fine. Relaxing rules is always > > easier than tightening them (later). > > Agreed. > > > > > One 'series' you might be interested in when respinning these is: > > > > > > https://lkml.kernel.org/r/20171011094833.pdp4torvotvjd...@hirez.programming.kicks-ass.net > > > > By doing synchronous domain rebuild we loose a bunch of funnies. > > Getting rid of the asynchronous nature of the hotplug path would be a > delight - I'll start keeping track of that effort as well. > > Thanks for the review, > Mathieu
[tip:sched/core] sched/deadline: Use C bitfields for the state flags
Commit-ID: 799ba82de01e7543f6b2042e1a739f3a20255f23 Gitweb: https://git.kernel.org/tip/799ba82de01e7543f6b2042e1a739f3a20255f23 Author: luca abeni <luca.ab...@santannapisa.it> AuthorDate: Thu, 7 Sep 2017 12:09:31 +0200 Committer: Ingo Molnar <mi...@kernel.org> CommitDate: Tue, 10 Oct 2017 11:45:26 +0200 sched/deadline: Use C bitfields for the state flags Ask the compiler to use a single bit for storing true / false values, instead of wasting the size of a whole int value. Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected Assembly (similar to the Assembly code generated when explicitly accessing the bits with bitmasks, "&" and "|"). Signed-off-by: luca abeni <luca.ab...@santannapisa.it> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com> Cc: Juri Lelli <juri.le...@arm.com> Cc: Linus Torvalds <torva...@linux-foundation.org> Cc: Mathieu Poirier <mathieu.poir...@linaro.org> Cc: Mike Galbraith <efa...@gmx.de> Cc: Peter Zijlstra <pet...@infradead.org> Cc: Steven Rostedt <rost...@goodmis.org> Cc: Thomas Gleixner <t...@linutronix.de> Link: http://lkml.kernel.org/r/1504778971-13573-5-git-send-email-luca.ab...@santannapisa.it Signed-off-by: Ingo Molnar <mi...@kernel.org> --- include/linux/sched.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 33a01f4..0f897df 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -474,10 +474,10 @@ struct sched_dl_entity { * conditions between the inactive timer handler and the wakeup * code. */ - int dl_throttled; - int dl_boosted; - int dl_yielded; - int dl_non_contending; + int dl_throttled : 1; + int dl_boosted: 1; + int dl_yielded: 1; + int dl_non_contending : 1; /* * Bandwidth enforcement timer. Each -deadline task has its
[tip:sched/core] sched/deadline: Use C bitfields for the state flags
Commit-ID: 799ba82de01e7543f6b2042e1a739f3a20255f23 Gitweb: https://git.kernel.org/tip/799ba82de01e7543f6b2042e1a739f3a20255f23 Author: luca abeni AuthorDate: Thu, 7 Sep 2017 12:09:31 +0200 Committer: Ingo Molnar CommitDate: Tue, 10 Oct 2017 11:45:26 +0200 sched/deadline: Use C bitfields for the state flags Ask the compiler to use a single bit for storing true / false values, instead of wasting the size of a whole int value. Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected Assembly (similar to the Assembly code generated when explicitly accessing the bits with bitmasks, "&" and "|"). Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-5-git-send-email-luca.ab...@santannapisa.it Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 33a01f4..0f897df 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -474,10 +474,10 @@ struct sched_dl_entity { * conditions between the inactive timer handler and the wakeup * code. */ - int dl_throttled; - int dl_boosted; - int dl_yielded; - int dl_non_contending; + int dl_throttled : 1; + int dl_boosted: 1; + int dl_yielded: 1; + int dl_non_contending : 1; /* * Bandwidth enforcement timer. Each -deadline task has its
[tip:sched/core] sched/deadline: Fix switching to -deadline
Commit-ID: 295d6d5e373607729bcc8182c25afe964655714f Gitweb: https://git.kernel.org/tip/295d6d5e373607729bcc8182c25afe964655714f Author: Luca Abeni <luca.ab...@santannapisa.it> AuthorDate: Thu, 7 Sep 2017 12:09:29 +0200 Committer: Ingo Molnar <mi...@kernel.org> CommitDate: Tue, 10 Oct 2017 11:43:30 +0200 sched/deadline: Fix switching to -deadline Fix a bug introduced in: 72f9f3fdc928 ("sched/deadline: Remove dl_new from struct sched_dl_entity") After that commit, when switching to -deadline if the scheduling deadline of a task is in the past then switched_to_dl() calls setup_new_entity() to properly initialize the scheduling deadline and runtime. The problem is that the task is enqueued _before_ having its parameters initialized by setup_new_entity(), and this can cause problems. For example, a task with its out-of-date deadline in the past will potentially be enqueued as the highest priority one; however, its adjusted deadline may not be the earliest one. This patch fixes the problem by initializing the task's parameters before enqueuing it. 
Signed-off-by: luca abeni <luca.ab...@santannapisa.it> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com> Cc: Juri Lelli <juri.le...@arm.com> Cc: Linus Torvalds <torva...@linux-foundation.org> Cc: Mathieu Poirier <mathieu.poir...@linaro.org> Cc: Mike Galbraith <efa...@gmx.de> Cc: Peter Zijlstra <pet...@infradead.org> Cc: Steven Rostedt <rost...@goodmis.org> Cc: Thomas Gleixner <t...@linutronix.de> Link: http://lkml.kernel.org/r/1504778971-13573-3-git-send-email-luca.ab...@santannapisa.it Signed-off-by: Ingo Molnar <mi...@kernel.org> --- kernel/sched/deadline.c | 11 --- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7..9d45354 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1364,6 +1364,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, +rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2255,13 +2259,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* -* If p is boosted we already updated its params in -* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), -* p's deadline being now already after rq_clock(rq). -*/ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(>dl); if (rq->curr != p) { #ifdef CONFIG_SMP
[tip:sched/core] sched/deadline: Fix switching to -deadline
Commit-ID: 295d6d5e373607729bcc8182c25afe964655714f Gitweb: https://git.kernel.org/tip/295d6d5e373607729bcc8182c25afe964655714f Author: Luca Abeni AuthorDate: Thu, 7 Sep 2017 12:09:29 +0200 Committer: Ingo Molnar CommitDate: Tue, 10 Oct 2017 11:43:30 +0200 sched/deadline: Fix switching to -deadline Fix a bug introduced in: 72f9f3fdc928 ("sched/deadline: Remove dl_new from struct sched_dl_entity") After that commit, when switching to -deadline if the scheduling deadline of a task is in the past then switched_to_dl() calls setup_new_entity() to properly initialize the scheduling deadline and runtime. The problem is that the task is enqueued _before_ having its parameters initialized by setup_new_entity(), and this can cause problems. For example, a task with its out-of-date deadline in the past will potentially be enqueued as the highest priority one; however, its adjusted deadline may not be the earliest one. This patch fixes the problem by initializing the task's parameters before enqueuing it. 
Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-3-git-send-email-luca.ab...@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 11 --- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7..9d45354 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1364,6 +1364,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, +rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2255,13 +2259,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* -* If p is boosted we already updated its params in -* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), -* p's deadline being now already after rq_clock(rq). -*/ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(>dl); if (rq->curr != p) { #ifdef CONFIG_SMP
[tip:sched/core] sched/headers: Remove duplicate prototype of __dl_clear_params()
Commit-ID: e964d3501b64d6930aaa4dd18955a8cd086ccb92 Gitweb: https://git.kernel.org/tip/e964d3501b64d6930aaa4dd18955a8cd086ccb92 Author: luca abeni <luca.ab...@santannapisa.it> AuthorDate: Thu, 7 Sep 2017 12:09:28 +0200 Committer: Ingo Molnar <mi...@kernel.org> CommitDate: Tue, 10 Oct 2017 11:43:30 +0200 sched/headers: Remove duplicate prototype of __dl_clear_params() Signed-off-by: luca abeni <luca.ab...@santannapisa.it> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com> Cc: Juri Lelli <juri.le...@arm.com> Cc: Linus Torvalds <torva...@linux-foundation.org> Cc: Mathieu Poirier <mathieu.poir...@linaro.org> Cc: Mike Galbraith <efa...@gmx.de> Cc: Peter Zijlstra <pet...@infradead.org> Cc: Steven Rostedt <rost...@goodmis.org> Cc: Thomas Gleixner <t...@linutronix.de> Link: http://lkml.kernel.org/r/1504778971-13573-2-git-send-email-luca.ab...@santannapisa.it Signed-off-by: Ingo Molnar <mi...@kernel.org> --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e83d1b8..9d5aa18 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy, extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); -extern void __dl_clear_params(struct task_struct *p); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
[tip:sched/core] sched/headers: Remove duplicate prototype of __dl_clear_params()
Commit-ID: e964d3501b64d6930aaa4dd18955a8cd086ccb92 Gitweb: https://git.kernel.org/tip/e964d3501b64d6930aaa4dd18955a8cd086ccb92 Author: luca abeni AuthorDate: Thu, 7 Sep 2017 12:09:28 +0200 Committer: Ingo Molnar CommitDate: Tue, 10 Oct 2017 11:43:30 +0200 sched/headers: Remove duplicate prototype of __dl_clear_params() Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-2-git-send-email-luca.ab...@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e83d1b8..9d5aa18 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy, extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); -extern void __dl_clear_params(struct task_struct *p); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
[PATCH 4/4] sched/deadline: use C bitfields for the state flags
Ask the compiler to use a single bit for storing true / false values, instead of wasting the size of a whole int value. Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected Assembly (similar to the Assembly code generated when explicitly accessing the bits with bitmasks, "&" and "|"). Signed-off-by: luca abeni <luca.ab...@santannapisa.it> --- include/linux/sched.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 68b3833..e03cc69 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -472,10 +472,10 @@ struct sched_dl_entity { * conditions between the inactive timer handler and the wakeup * code. */ - int dl_throttled; - int dl_boosted; - int dl_yielded; - int dl_non_contending; + int dl_throttled : 1; + int dl_boosted: 1; + int dl_yielded: 1; + int dl_non_contending : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- 2.7.4
[PATCH 4/4] sched/deadline: use C bitfields for the state flags
Ask the compiler to use a single bit for storing true / false values, instead of wasting the size of a whole int value. Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected Assembly (similar to the Assembly code generated when explicitly accessing the bits with bitmasks, "&" and "|"). Signed-off-by: luca abeni --- include/linux/sched.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 68b3833..e03cc69 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -472,10 +472,10 @@ struct sched_dl_entity { * conditions between the inactive timer handler and the wakeup * code. */ - int dl_throttled; - int dl_boosted; - int dl_yielded; - int dl_non_contending; + int dl_throttled : 1; + int dl_boosted: 1; + int dl_yielded: 1; + int dl_non_contending : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- 2.7.4
[PATCH 2/4] sched/deadline: fix switching to -deadline
Fix a bug introduced in 72f9f3fdc928dc3ecd223e801b32d930b662b6ed: after that commit, when switching to -deadline if the scheduling deadline of a task is in the past then switched_to_dl() calls setup_new_entity() to properly initialize the scheduling deadline and runtime. The problem is that the task is enqueued _before_ having its parameters initialized by setup_new_entity(), and this can cause problems. For example, a task with its out-of-date deadline in the past will potentially be enqueued as the highest priority one; however, its adjusted deadline may not be the earliest one. This patch fixes the problem by initializing the task's parameters before enqueuing it. Signed-off-by: luca abeni <luca.ab...@santannapisa.it> Reviewed-by: Daniel Bristot de Oliveira <bris...@redhat.com> --- kernel/sched/deadline.c | 11 --- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d05bd94..d7f9e86 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1376,6 +1376,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, +rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2267,13 +2271,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* -* If p is boosted we already updated its params in -* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), -* p's deadline being now already after rq_clock(rq). -*/ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(>dl); if (rq->curr != p) { #ifdef CONFIG_SMP -- 2.7.4
[PATCH 2/4] sched/deadline: fix switching to -deadline
Fix a bug introduced in 72f9f3fdc928dc3ecd223e801b32d930b662b6ed: after that commit, when switching to -deadline if the scheduling deadline of a task is in the past then switched_to_dl() calls setup_new_entity() to properly initialize the scheduling deadline and runtime. The problem is that the task is enqueued _before_ having its parameters initialized by setup_new_entity(), and this can cause problems. For example, a task with its out-of-date deadline in the past will potentially be enqueued as the highest priority one; however, its adjusted deadline may not be the earliest one. This patch fixes the problem by initializing the task's parameters before enqueuing it. Signed-off-by: luca abeni Reviewed-by: Daniel Bristot de Oliveira --- kernel/sched/deadline.c | 11 --- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d05bd94..d7f9e86 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1376,6 +1376,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, +rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2267,13 +2271,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* -* If p is boosted we already updated its params in -* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), -* p's deadline being now already after rq_clock(rq). -*/ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(>dl); if (rq->curr != p) { #ifdef CONFIG_SMP -- 2.7.4
[PATCH 3/4] sched/deadline: rename __dl_clear() to __dl_sub()
From: Peter Zijlstra <pet...@infradead.org> __dl_sub() is more meaningful as a name, and is more consistent with the naming of the dual function (__dl_add()). Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> Signed-off-by: luca abeni <luca.ab...@santannapisa.it> --- kernel/sched/deadline.c | 10 +++++----- kernel/sched/sched.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d7f9e86..bab11dd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -242,7 +242,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -1211,7 +1211,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -2182,7 +2182,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -2460,7 +2460,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && @@ -2472,7 +2472,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. 
*/ - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0b93e4b..061ec3b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -226,7 +226,7 @@ struct dl_bw { static inline void __dl_update(struct dl_bw *dl_b, s64 bw); static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; __dl_update(dl_b, (s32)tsk_bw / cpus); -- 2.7.4
[PATCH 3/4] sched/deadline: rename __dl_clear() to __dl_sub()
From: Peter Zijlstra __dl_sub() is more meaningful as a name, and is more consistent with the naming of the dual function (__dl_add()). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: luca abeni --- kernel/sched/deadline.c | 10 +++++----- kernel/sched/sched.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d7f9e86..bab11dd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -242,7 +242,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -1211,7 +1211,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -2182,7 +2182,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -2460,7 +2460,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && @@ -2472,7 +2472,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. 
*/ - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0b93e4b..061ec3b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -226,7 +226,7 @@ struct dl_bw { static inline void __dl_update(struct dl_bw *dl_b, s64 bw); static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; __dl_update(dl_b, (s32)tsk_bw / cpus); -- 2.7.4
[PATCH 1/4] sched/sched.h: remove duplicate prototype of __dl_clear_params()
Signed-off-by: luca abeni <luca.ab...@santannapisa.it> --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1043c8b..0b93e4b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy, extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); -extern void __dl_clear_params(struct task_struct *p); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); -- 2.7.4