The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after ark-5.14
------>
commit cce10f3b8b9d37ded29f40672502c213de4c22f5
Author: Kirill Tkhai <ktk...@virtuozzo.com>
Date:   Fri Sep 24 14:49:32 2021 +0300
sched: Port CONFIG_CFS_CPULIMIT feature

Add the possibility to limit the CPUs used by a cgroup/container.

Signed-off-by: Vladimir Davydov <vdavy...@parallels.com>
Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com>

+++
sched: Allow configuring sched_vcpu_hotslice and sched_cpulimit_scale_cpufreq

Let's make our sysctls ported from vz8 really configurable. These are lost
hunks from vz7 commits:

f06fef25c0859 ("sched: Add cpulimit base interfaces")
4805ea1432210 ("ve/sched: port vcpu hotslice")

https://jira.sw.ru/browse/PSBM-127780

Fixes: ddbb18ac80519 ("sched: Port CONFIG_CFS_CPULIMIT feature")
Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>

+++
kernel/sched/fair.c: Add missing update_rq_clock() calls

We've got a hard lockup which seems to be caused by the mgag200 console
printk code calling schedule_work() from the scheduler with rq->lock held:

 #5 [ffffb79e034239a8] native_queued_spin_lock_slowpath at ffffffff8b50c6c6
 #6 [ffffb79e034239a8] _raw_spin_lock at ffffffff8bc96e5c
 #7 [ffffb79e034239b0] try_to_wake_up at ffffffff8b4e26ff
 #8 [ffffb79e03423a10] __queue_work at ffffffff8b4ce3f3
 #9 [ffffb79e03423a58] queue_work_on at ffffffff8b4ce714
#10 [ffffb79e03423a68] mga_imageblit at ffffffffc026d666 [mgag200]
#11 [ffffb79e03423a80] soft_cursor at ffffffff8b8a9d84
#12 [ffffb79e03423ad8] bit_cursor at ffffffff8b8a99b2
#13 [ffffb79e03423ba0] hide_cursor at ffffffff8b93bc7a
#14 [ffffb79e03423bb0] vt_console_print at ffffffff8b93e07d
#15 [ffffb79e03423c18] console_unlock at ffffffff8b518f0e
#16 [ffffb79e03423c68] vprintk_emit_log at ffffffff8b51acf7
#17 [ffffb79e03423cc0] vprintk_default at ffffffff8b51adcd
#18 [ffffb79e03423cd0] printk at ffffffff8b51b3d6
#19 [ffffb79e03423d30] __warn_printk at ffffffff8b4b13a0
#20 [ffffb79e03423d98] assert_clock_updated at ffffffff8b4dd293
#21 [ffffb79e03423da0] deactivate_task at ffffffff8b4e12d1
#22 [ffffb79e03423dc8] move_task_group at ffffffff8b4eaa5b
#23 [ffffb79e03423e00] cpulimit_balance_cpu_stop at ffffffff8b4f02f3
#24 [ffffb79e03423eb0] cpu_stopper_thread at ffffffff8b576b67
#25 [ffffb79e03423ee8] smpboot_thread_fn at ffffffff8b4d9125
#26 [ffffb79e03423f10] kthread at ffffffff8b4d4fc2
#27 [ffffb79e03423f50] ret_from_fork at ffffffff8be00255

The printk was called because assert_clock_updated() triggered

    SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);

which means we are missing a necessary update_rq_clock() call. Add one to
cpulimit_balance_cpu_stop() to fix the warning. Also add one in
load_balance() before the move_task_groups() call, which seems to be another
place missing this call.

https://jira.sw.ru/browse/PSBM-108013
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>

+++
kernel/sched/fair.c: Add more missing update_rq_clock() calls

Add update_rq_clock() for 'target_rq' to avoid a WARN() coming from
attach_task(). Also add rq_repin_lock(busiest, &rf) in load_balance() for
detach_task(). The update_rq_clock() isn't necessary there since the clock
was already updated, but we need the repin since the rq lock was released
after the update.
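Note (illustration only, not from this patch): the rule both update_rq_clock()
fixes enforce is that after an rq lock is taken or re-taken, update_rq_clock()
must run on that rq before any task is detached from or attached to it,
otherwise assert_clock_updated() fires. A minimal sketch of that pattern;
balance_one_group() is a hypothetical name, while double_lock_balance(),
update_rq_clock(), detach_task() and attach_task() are the real scheduler
helpers used by cpulimit_balance_cpu_stop():

	/* Sketch only: caller already holds src_rq's lock, as the stopper thread does. */
	static void balance_one_group(struct rq *src_rq, struct rq *dst_rq,
				      struct task_struct *p, struct lb_env *env)
	{
		/* may drop and re-take the rq locks internally */
		double_lock_balance(src_rq, dst_rq);

		/* refresh both clocks before moving tasks between the rqs */
		update_rq_clock(src_rq);
		update_rq_clock(dst_rq);

		detach_task(p, env);	/* deactivate_task() on src_rq */
		attach_task(dst_rq, p);	/* activate_task() on dst_rq */

		double_unlock_balance(src_rq, dst_rq);
	}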
https://jira.sw.ru/browse/PSBM-108013
Reported-by: Kirill Tkhai <ktk...@virtuozzo.com>
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
Acked-by: Kirill Tkhai <ktk...@virtuozzo.com>

https://jira.sw.ru/browse/PSBM-133986

See also:
  5cb9eaa3d ("sched: Wrap rq::lock access")
  36c5bdc43 ("sched/topology: Kill SD_LOAD_BALANCE")
  e669ac8ab ("sched: Remove checks against SD_LOAD_BALANCE")
  9818427c6 ("sched/debug: Make sd->flags sysctl read-only")

(cherry picked from commit fbafc1d55798fb54805164bb79a99aba859b294d)
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalit...@virtuozzo.com>
---
 include/linux/sched.h          |  29 +++
 include/linux/sched/sysctl.h   |   5 +
 include/linux/sched/topology.h |   5 +
 init/Kconfig                   |   4 +
 kernel/sched/core.c            |  44 +++++
 kernel/sched/fair.c            | 396 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h           |  16 ++
 kernel/sysctl.c                |  19 ++
 8 files changed, 518 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 31e9e41b9d9d..c91d4777aedd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -451,6 +451,9 @@ struct sched_statistics {
 	u64			nr_migrations_cold;
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
+#ifdef CONFIG_CFS_CPULIMIT
+	u64			nr_failed_migrations_cpulimit;
+#endif
 	u64			nr_failed_migrations_hot;
 	u64			nr_forced_migrations;
@@ -471,6 +474,9 @@ struct sched_entity {
 	struct load_weight	load;
 	struct rb_node		run_node;
 	struct list_head	group_node;
+#ifdef CONFIG_CFS_CPULIMIT
+	struct list_head	cfs_rq_node;
+#endif
 	unsigned int		on_rq;
 
 	u64			exec_start;
@@ -2053,6 +2059,29 @@ static inline bool vcpu_is_preempted(int cpu)
 }
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int task_nr_cpus(struct task_struct *p);
+extern unsigned int task_vcpu_id(struct task_struct *p);
+extern unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq);
+#else
+static inline unsigned int task_nr_cpus(struct task_struct *p)
+{
+	return num_online_cpus();
+}
+
+static inline unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p);
+}
+
+static inline unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	return freq;
+}
+#endif
+
+#define num_online_vcpus() task_nr_cpus(current)
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index db2c0f34aaaf..b6adb2b82e52 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -99,4 +99,9 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int sysctl_sched_vcpu_hotslice;
+extern unsigned int sysctl_sched_cpulimit_scale_cpufreq;
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8f0f778b7c91..379fd57f665e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -118,6 +118,11 @@ struct sched_domain {
 	unsigned int alb_failed;
 	unsigned int alb_pushed;
 
+	/* cpulimit balancing */
+	unsigned int clb_count;
+	unsigned int clb_failed;
+	unsigned int clb_pushed;
+
 	/* SD_BALANCE_EXEC stats */
 	unsigned int sbe_count;
 	unsigned int sbe_balanced;
diff --git a/init/Kconfig b/init/Kconfig
index 564553afb251..157a015393ac 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -966,9 +966,13 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_CPULIMIT
+	bool
+
 config CFS_BANDWIDTH
 	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
 	depends on FAIR_GROUP_SCHED
+	select CFS_CPULIMIT
 	default n
 	help
 	  This option allows users to define CPU bandwidth rates (limits) for
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ebb6dd99b442..d824282e942b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -371,6 +371,47 @@ static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
  */
 int sysctl_sched_rt_runtime = 950000;
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+	rcu_read_lock();
+	nr_cpus = task_group(p)->nr_cpus;
+	rcu_read_unlock();
+
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p) % task_nr_cpus(p);
+}
+
+unsigned int sysctl_sched_cpulimit_scale_cpufreq = 1;
+
+unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	unsigned long rate, max_rate;
+
+	if (!sysctl_sched_cpulimit_scale_cpufreq)
+		return freq;
+
+	rcu_read_lock();
+	rate = task_group(current)->cpu_rate;
+	rcu_read_unlock();
+
+	max_rate = num_online_vcpus() * MAX_CPU_RATE;
+	if (!rate || rate >= max_rate)
+		return freq;
+
+	return div_u64((u64)freq * rate, max_rate); /* avoid 32bit overflow */
+}
+#endif
+
 /*
  * Serialization rules:
@@ -9085,6 +9126,9 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.children);
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
+#ifdef CONFIG_CFS_CPULIMIT
+	root_task_group.topmost_limited_ancestor = &root_task_group;
+#endif
 #endif /* CONFIG_CGROUP_SCHED */
 
 	for_each_possible_cpu(i) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fb30663db2fe..c42ff00885c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -134,6 +134,11 @@ int __weak arch_asym_cpu_priority(int cpu)
 * (default: 5 msec, units: microseconds)
 */
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+
+#endif
+
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
 #endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -470,6 +475,88 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_CFS_CPULIMIT
+static int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->active;
+}
+
+static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+	/* if we canceled delayed dec, there is no need to do inc */
+	if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
+		atomic_inc(&cfs_rq->tg->nr_cpus_active);
+	cfs_rq->active = 1;
+}
+
+static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+	if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
+		postpone = 0;
+
+	if (!postpone) {
+		cfs_rq->active = 0;
+		atomic_dec(&cfs_rq->tg->nr_cpus_active);
+	} else {
+		hrtimer_start_range_ns(&cfs_rq->active_timer,
+				ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
+				HRTIMER_MODE_REL_PINNED);
+	}
+}
+
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	struct cfs_rq *cfs_rq =
+		container_of(timer, struct cfs_rq, active_timer);
+	struct rq *rq = rq_of(cfs_rq);
+	unsigned long flags;
+
+	raw_spin_rq_lock_irqsave(rq, flags);
+	cfs_rq->active = !list_empty(&cfs_rq->tasks);
+	raw_spin_rq_unlock_irqrestore(rq, flags);
+
+	atomic_dec(&cfs_rq->tg->nr_cpus_active);
+
+	return HRTIMER_NORESTART;
+}
+
+static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+	int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
+	int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
+
+	nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
+		min_t(int, nr_cpus_limit, tg->nr_cpus) :
+		max_t(int, nr_cpus_limit, tg->nr_cpus);
+
+	if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
+		return 1;
+
+	if (nr_cpus_active > nr_cpus_limit)
+		return -1;
+
+	return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
+}
+#else /* !CONFIG_CFS_CPULIMIT */
+static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+}
+
+static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	return 0;
+}
+
+static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+	return 1;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -2960,6 +3047,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		account_numa_enqueue(rq, task_of(se));
 		list_add(&se->group_node, &rq->cfs_tasks);
+#ifdef CONFIG_CFS_CPULIMIT
+		list_add(&se->cfs_rq_node, &cfs_rq->tasks);
+#endif
 	}
 #endif
 	cfs_rq->nr_running++;
@@ -2973,6 +3063,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+#ifdef CONFIG_CFS_CPULIMIT
+		list_del(&se->cfs_rq_node);
+#endif
 	}
 #endif
 	cfs_rq->nr_running--;
@@ -4251,6 +4344,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
 	bool curr = cfs_rq->curr == se;
 
+	if (!cfs_rq->load.weight)
+		inc_nr_active_cfs_rqs(cfs_rq);
 	/*
 	 * If we're the current task, we must renormalise before calling
 	 * update_curr().
 	 */
@@ -4408,6 +4503,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
 		update_min_vruntime(cfs_rq);
+
+	if (!cfs_rq->load.weight)
+		dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
 }
 
 /*
@@ -5332,6 +5430,10 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_CFS_CPULIMIT
+	hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_rq->active_timer.function = sched_cfs_active_timer;
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5727,6 +5829,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 /* Working cpumask for: load_balance, load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+#ifdef CONFIG_CFS_CPULIMIT
+static DEFINE_PER_CPU(struct callback_head, cpulimit_cb_head);
+#endif
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -6844,6 +6949,38 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	return target;
 }
 
+static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg;
+	struct sched_domain *sd;
+	int prev_cpu = task_cpu(p);
+	int cpu;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, *new_cpu) > 0)
+		return false;
+
+	if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+		return true;
+
+	if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+		*new_cpu = prev_cpu;
+		return true;
+	}
+
+	for_each_domain(*new_cpu, sd) {
+		for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				*new_cpu = cpu;
+				return true;
+			}
+		}
+	}
+#endif
+	return false;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6903,6 +7040,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 			break;
 	}
 
+	if (select_runnable_cpu(p, &new_cpu))
+		goto unlock;
+
 	if (unlikely(sd)) {
 		/* Slow path */
 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
@@ -6913,6 +7053,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		if (want_affine)
 			current->recent_used_cpu = cpu;
 	}
+unlock:
 	rcu_read_unlock();
 
 	return new_cpu;
@@ -7195,6 +7336,51 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
+static int cpulimit_balance_cpu_stop(void *data);
+
+static void trigger_cpulimit_balance(struct rq *this_rq)
+{
+	struct task_struct *p = this_rq->curr;
+	struct task_group *tg;
+	int this_cpu, cpu, target_cpu = -1;
+	struct sched_domain *sd;
+
+	this_cpu = cpu_of(this_rq);
+
+	if (!p->se.on_rq || this_rq->active_balance)
+		return;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, this_cpu) >= 0)
+		return;
+
+	rcu_read_lock();
+	for_each_domain(this_cpu, sd) {
+		for_each_cpu_and(cpu, sched_domain_span(sd),
+				 p->cpus_ptr) {
+			if (cpu != this_cpu &&
+			    cfs_rq_active(tg->cfs_rq[cpu])) {
+				target_cpu = cpu;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	if (target_cpu >= 0) {
+		this_rq->active_balance = 1;
+		this_rq->push_cpu = target_cpu;
+		raw_spin_rq_unlock(this_rq);
+		stop_one_cpu_nowait(this_rq->cpu,
+				cpulimit_balance_cpu_stop, this_rq,
+				&this_rq->active_balance_work);
+		raw_spin_rq_lock(this_rq);
+	}
+}
+#endif
+
 struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -7282,6 +7468,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		set_next_entity(cfs_rq, se);
 	}
 
+#ifdef CONFIG_CFS_CPULIMIT
+	queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
 	goto done;
 simple:
 #endif
@@ -7311,6 +7500,9 @@ done: __maybe_unused;
 
 	update_misfit_status(p, rq);
 
+#ifdef CONFIG_CFS_CPULIMIT
+	queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
 	return p;
 
 idle:
@@ -7716,6 +7908,37 @@ static inline int migrate_degrades_locality(struct task_struct *p,
 }
 #endif
 
+static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+
+	if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
+		int cpu;
+
+		schedstat_inc(p->se.statistics.nr_failed_migrations_cpulimit);
+
+		env->flags |= LBF_SOME_PINNED;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0)
+			return 0;
+
+		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+			return 0;
+
+		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				env->flags |= LBF_DST_PINNED;
+				env->new_dst_cpu = cpu;
+				break;
+			}
+		}
+		return 0;
+	}
+#endif
+	return 1;
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -7726,6 +7949,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 	lockdep_assert_rq_held(env->src_rq);
 
+	if (!can_migrate_task_cpulimit(p, env))
+		return 0;
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -8087,6 +8312,161 @@ static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+static unsigned long entity_h_load(struct sched_entity *se);
+
+static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se;
+	struct task_struct *p;
+
+	list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
+		p = task_of(se);
+		if (task_curr(p) ||
+		    !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
+			return 0;
+	}
+	env->flags &= ~LBF_ALL_PINNED;
+	return 1;
+}
+
+static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se, *tmp;
+	int moved = 0;
+
+	list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
+		struct task_struct *p = task_of(se);
+		detach_task(p, env);
+		attach_task(env->dst_rq, p);
+		moved++;
+	}
+	return moved;
+}
+
+static int move_task_groups(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq, *pos;
+	struct task_group *tg;
+	unsigned long load;
+	int cur_pulled, pulled = 0;
+
+	if (env->imbalance <= 0)
+		return 0;
+
+	for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/*
+		 * A child always goes before its parent in a leaf_cfs_rq_list.
+		 * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+		 * we could not migrate the child and therefore we should not
+		 * even try to migrate the parent.
+		 */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
+			continue;
+
+		load = entity_h_load(tg->se[env->src_cpu]);
+		if ((load / 2) > env->imbalance)
+			continue;
+
+		if (!can_migrate_task_group(cfs_rq, env))
+			continue;
+
+		cur_pulled = move_task_group(cfs_rq, env);
+		pulled += cur_pulled;
+		env->imbalance -= load;
+
+		env->loop += cur_pulled;
+		if (env->loop > env->loop_max)
+			break;
+
+		if (env->imbalance <= 0)
+			break;
+	}
+	return pulled;
+}
+
+static int do_cpulimit_balance(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq, *pos;
+	struct task_group *tg;
+	int pushed = 0;
+
+	for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/* see move_task_groups for why we skip such groups */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+		if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
+		    can_migrate_task_group(cfs_rq, env))
+			pushed += move_task_group(cfs_rq, env);
+	}
+	return pushed;
+}
+
+static int cpulimit_balance_cpu_stop(void *data)
+{
+	struct rq *rq = data;
+	int cpu = cpu_of(rq);
+	int target_cpu = rq->push_cpu;
+	struct rq *target_rq = cpu_rq(target_cpu);
+	struct sched_domain *sd;
+
+	raw_spin_rq_lock_irq(rq);
+
+	if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
+		     !cpu_online(target_cpu)))
+		goto out_unlock;
+
+	if (unlikely(!rq->nr_running))
+		goto out_unlock;
+
+	BUG_ON(rq == target_rq);
+
+	double_lock_balance(rq, target_rq);
+	rcu_read_lock();
+	for_each_domain(target_cpu, sd) {
+		if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
+			break;
+	}
+	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= cpu,
+			.src_rq		= rq,
+		};
+
+		schedstat_inc(sd->clb_count);
+
+		update_rq_clock(rq);
+		update_rq_clock(target_rq);
+		if (do_cpulimit_balance(&env))
+			schedstat_inc(sd->clb_pushed);
+		else
+			schedstat_inc(sd->clb_failed);
+	}
+	rcu_read_unlock();
+	double_unlock_balance(rq, target_rq);
+
+out_unlock:
+	rq->active_balance = 0;
+	raw_spin_rq_unlock_irq(rq);
+	return 0;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static bool __update_blocked_others(struct rq *rq, bool *done)
 {
 	const struct sched_class *curr_class;
@@ -9812,6 +10192,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 		local_irq_restore(rf.flags);
 
+#ifdef CONFIG_CFS_CPULIMIT
+		if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
+			env.loop = 0;
+			local_irq_save(rf.flags);
+			double_rq_lock(env.dst_rq, busiest);
+			rq_repin_lock(env.src_rq, &rf);
+			update_rq_clock(env.dst_rq);
+			cur_ld_moved = ld_moved = move_task_groups(&env);
+			double_rq_unlock(env.dst_rq, busiest);
+			local_irq_restore(rf.flags);
+		}
+#endif
+
 		if (env.flags & LBF_NEED_BREAK) {
 			env.flags &= ~LBF_NEED_BREAK;
 			goto more_balance;
@@ -11251,6 +11644,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
+#ifdef CONFIG_CFS_CPULIMIT
+	INIT_LIST_HEAD(&cfs_rq->tasks);
+#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ed6e12e3eb65..9cddbc9920f8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -433,6 +433,14 @@ struct task_group {
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+#define MAX_CPU_RATE 1024
+	unsigned long cpu_rate;
+	unsigned int nr_cpus;
+	atomic_t nr_cpus_active;
+	struct task_group *topmost_limited_ancestor; /* self if none of the
+							ancestors is limited */
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -540,6 +548,9 @@ struct cfs_rq {
 #endif
 
 	struct rb_root_cached	tasks_timeline;
+#ifdef CONFIG_CFS_CPULIMIT
+	struct list_head	tasks;
+#endif
 
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
@@ -613,6 +624,10 @@ struct cfs_rq {
 	int			throttle_count;
 	struct list_head	throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_CFS_CPULIMIT
+	int			active;
+	struct hrtimer		active_timer;
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -2087,6 +2102,7 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_TASK_SLEEP	0x10
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 21d00e6954dd..5824d5dd2e1d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1866,6 +1866,25 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.procname	= "sched_vcpu_hotslice",
+		.data		= &sysctl_sched_vcpu_hotslice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "sched_cpulimit_scale_cpufreq",
+		.data		= &sysctl_sched_cpulimit_scale_cpufreq,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel