From: Kirill Tkhai <ktk...@virtuozzo.com> Extracted from "Initial patch".
Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com> +++ sched: fix cfs_rq::nr_iowait accounting After recent RedHat commit (b6be9ae "rh7: import RHEL7 kernel-3.10.0-957.12.2.el7") the following sequence: update_stats_dequeue() dequeue_sleeper() cfs_rq->nr_iowait++ is called conditionally and cfs_rq::nr_iowait is incremented only if schedstat_enabled() is true. However, it is expected that this counter is handled independently of other scheduler statistics gathering. To fix it, move cfs_rq::nr_iowait incrementing out of the schedstat_enabled() check. https://jira.sw.ru/browse/PSBM-93850 Signed-off-by: Jan Dakinevich <jan.dakinev...@virtuozzo.com> Reviewed-by: Kirill Tkhai <ktk...@virtuozzo.com> Reviewed-by: Konstantin Khorenko <khore...@virtuozzo.com> khorenko@ note: after this patch "nr_iowait" should be accounted properly until disk io limits are set for a Container and throttling is activated. Taking into account that at the moment "nr_iowait" is always broken, let's apply the current patch and rework "nr_iowait" accounting to honor the throttle code later. At the moment throttle_cfs_rq() will inc nr_iowait (in dequeue_entity()) while unthrottle_cfs_rq() won't decrement it in enqueue_entity(). Changes when porting to VZ8: - Drop hunk in try_to_wake_up_local() as old code path: schedule __schedule try_to_wake_up_local nr_iowait_dec is now replaced by mainstream with: schedule sched_submit_work wq_worker_sleeping wake_up_process try_to_wake_up nr_iowait_dec and there is no more try_to_wake_up_local(). - Replace removal hunk in dequeue_sleeper() with corresponding hunk in update_stats_dequeue. 
https://jira.sw.ru/browse/PSBM-127846 (cherry-picked from vz7 commit 0bf288fedba7 ("sched: fix cfs_rq::nr_iowait accounting")) Fixes: ebd33cb22f39 ("sched: Account cfs_rq::nr_iowait") Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com> https://jira.sw.ru/browse/PSBM-133986 task->state -> READ_ONCE(task->__state) (cherry picked from commit 30967ce528450629853dc71362fdd1aef21a3245) Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalit...@virtuozzo.com> --- kernel/sched/core.c | 17 +++++++++++++++++ kernel/sched/fair.c | 25 +++++++++++++++++++++++++ kernel/sched/sched.h | 3 +++ 3 files changed, 45 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7a1d5d09ade..c0c6a90ea32c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3854,6 +3854,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); + if (p->in_iowait && p->sched_class->nr_iowait_dec) { + struct rq_flags rf; + struct rq *rq; + + rq = __task_rq_lock(p, &rf); + p->sched_class->nr_iowait_dec(p); + __task_rq_unlock(rq, &rf); + } + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); if (task_cpu(p) != cpu) { if (p->in_iowait) { @@ -9546,6 +9555,10 @@ void sched_move_task(struct task_struct *tsk) if (queued) dequeue_task(rq, tsk, queue_flags); else { + if (!(READ_ONCE(tsk->__state) == TASK_WAKING) && tsk->in_iowait && + tsk->sched_class->nr_iowait_dec) + tsk->sched_class->nr_iowait_dec(tsk); + if (tsk->sched_contributes_to_load) task_cfs_rq(tsk)->nr_unint--; @@ -9560,6 +9573,10 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); else { + if (!(READ_ONCE(tsk->__state) == TASK_WAKING) && tsk->in_iowait && + tsk->sched_class->nr_iowait_inc) + tsk->sched_class->nr_iowait_inc(tsk); + if (tsk->sched_contributes_to_load) task_cfs_rq(tsk)->nr_unint++; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44c452072a1b..fb32b3480e19 --- 
a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4372,6 +4372,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue(cfs_rq, se, flags); + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->in_iowait) + cfs_rq->nr_iowait++; + } + clear_buddies(cfs_rq, se); if (se != cfs_rq->curr) @@ -11483,6 +11490,22 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task return rr_interval; } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void nr_iowait_dec_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + cfs_rq->nr_iowait--; +} + +static void nr_iowait_inc_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + cfs_rq->nr_iowait++; +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* * All the scheduling class methods: */ @@ -11525,6 +11548,8 @@ DEFINE_SCHED_CLASS(fair) = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, + .nr_iowait_inc = nr_iowait_inc_fair, + .nr_iowait_dec = nr_iowait_dec_fair, #endif #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 649210b93e11..ed6e12e3eb65 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -525,6 +525,7 @@ struct cfs_rq { unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int nr_iowait; unsigned int nr_unint; u64 exec_clock; @@ -2165,6 +2166,8 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif + void (*nr_iowait_inc) (struct task_struct *p); + void (*nr_iowait_dec) (struct task_struct *p); }; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) -- 2.31.1 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel