The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at
https://src.openvz.org/scm/ovz/vzkernel.git after ark-5.14
------>
commit cce10f3b8b9d37ded29f40672502c213de4c22f5
Author: Kirill Tkhai <ktk...@virtuozzo.com>
Date:   Fri Sep 24 14:49:32 2021 +0300

    sched: Port CONFIG_CFS_CPULIMIT feature
    
    Add the possibility to limit the CPUs used by a cgroup/container.
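
    The limit is driven by the two per-task_group knobs added below,
    nr_cpus and cpu_rate (in MAX_CPU_RATE units per CPU). As a rough
    illustration of how they combine (a sketch only, mirroring the
    nr_cpus_limit computation in check_cpulimit_spread() below; the
    helper name is made up and not part of this patch):

        /* Hypothetical helper, for illustration only. */
        static int effective_cpu_limit(struct task_group *tg)
        {
                int by_rate = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);

                if (by_rate && tg->nr_cpus)
                        return min_t(int, by_rate, tg->nr_cpus);
                return max_t(int, by_rate, tg->nr_cpus);
        }

    Roughly speaking, a group with an effective limit of N is kept on
    at most N "active" cfs runqueues: select_runnable_cpu() steers
    wakeups back onto them and cpulimit_balance_cpu_stop() pushes
    stray tasks there once the limit is exceeded.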
    
    Signed-off-by: Vladimir Davydov <vdavy...@parallels.com>
    
    Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com>
    
    +++
    sched: Allow configuring sched_vcpu_hotslice and sched_cpulimit_scale_cpufreq
    
    Let's make the sysctls ported from vz8 actually configurable.
    
    These are lost hunks from vz7 commits:
    f06fef25c0859 ("sched: Add cpulimit base interfaces")
    4805ea1432210 ("ve/sched: port vcpu hotslice")
    
    https://jira.sw.ru/browse/PSBM-127780
    Fixes: ddbb18ac80519 ("sched: Port CONFIG_CFS_CPULIMIT feature")
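
    For reference, the two knobs as they appear in the hunks below:
    sched_vcpu_hotslice is the time in nanoseconds a bandwidth-controlled
    vcpu (cfs_rq) stays "active" after its last task dequeues (default
    5000000, i.e. 5 ms), and sched_cpulimit_scale_cpufreq is a 0/1
    switch that scales the cpufreq reported to a limited group. A worked
    example with made-up numbers: with the switch on, cpu_rate = 1536
    and nr_cpus unset on a 4-CPU host, a 3000000 kHz host frequency
    would be reported as roughly

        3000000 * 1536 / (4 * 1024) = 1125000 kHz

    while rate == 0 or rate >= num_online_vcpus() * MAX_CPU_RATE leaves
    the frequency unscaled.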
    
    Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
    
    +++
    kernel/sched/fair.c: Add missing update_rq_clock() calls
    
    We've hit a hard lockup which seems to be caused by the mgag200
    console printk code calling schedule_work from the scheduler
    with rq->lock held:
    
      #5 [ffffb79e034239a8] native_queued_spin_lock_slowpath at ffffffff8b50c6c6
      #6 [ffffb79e034239a8] _raw_spin_lock at ffffffff8bc96e5c
      #7 [ffffb79e034239b0] try_to_wake_up at ffffffff8b4e26ff
      #8 [ffffb79e03423a10] __queue_work at ffffffff8b4ce3f3
      #9 [ffffb79e03423a58] queue_work_on at ffffffff8b4ce714
     #10 [ffffb79e03423a68] mga_imageblit at ffffffffc026d666 [mgag200]
     #11 [ffffb79e03423a80] soft_cursor at ffffffff8b8a9d84
     #12 [ffffb79e03423ad8] bit_cursor at ffffffff8b8a99b2
     #13 [ffffb79e03423ba0] hide_cursor at ffffffff8b93bc7a
     #14 [ffffb79e03423bb0] vt_console_print at ffffffff8b93e07d
     #15 [ffffb79e03423c18] console_unlock at ffffffff8b518f0e
     #16 [ffffb79e03423c68] vprintk_emit_log at ffffffff8b51acf7
     #17 [ffffb79e03423cc0] vprintk_default at ffffffff8b51adcd
     #18 [ffffb79e03423cd0] printk at ffffffff8b51b3d6
     #19 [ffffb79e03423d30] __warn_printk at ffffffff8b4b13a0
     #20 [ffffb79e03423d98] assert_clock_updated at ffffffff8b4dd293
     #21 [ffffb79e03423da0] deactivate_task at ffffffff8b4e12d1
     #22 [ffffb79e03423dc8] move_task_group at ffffffff8b4eaa5b
     #23 [ffffb79e03423e00] cpulimit_balance_cpu_stop at ffffffff8b4f02f3
     #24 [ffffb79e03423eb0] cpu_stopper_thread at ffffffff8b576b67
     #25 [ffffb79e03423ee8] smpboot_thread_fn at ffffffff8b4d9125
     #26 [ffffb79e03423f10] kthread at ffffffff8b4d4fc2
     #27 [ffffb79e03423f50] ret_from_fork at ffffffff8be00255
    
    The printk was called because assert_clock_updated() triggered
            SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
    
    This means that we are missing a necessary update_rq_clock() call.
    Add one to cpulimit_balance_cpu_stop() to fix the warning.
    Also add one in load_balance() before the move_task_groups() call,
    which appears to be another place missing this call.
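
    A minimal sketch of the rule being applied (illustrative only; the
    helper below is made up, the real changes are in the hunks below):
    code that moves tasks with the *_NOCLOCK variants, as detach_task()
    and attach_task() do, must refresh the rq clocks first, otherwise
    assert_clock_updated() fires underneath deactivate_task() /
    activate_task():

        static void move_one_task_sketch(struct rq *src, struct rq *dst,
                                         struct task_struct *p)
        {
                lockdep_assert_rq_held(src);
                lockdep_assert_rq_held(dst);

                /* Clear the "stale clock" state the assert checks for. */
                update_rq_clock(src);
                update_rq_clock(dst);

                deactivate_task(src, p, DEQUEUE_NOCLOCK);
                set_task_cpu(p, cpu_of(dst));
                activate_task(dst, p, ENQUEUE_NOCLOCK);
        }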
    
    https://jira.sw.ru/browse/PSBM-108013
    Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
    
    +++
    kernel/sched/fair.c: Add more missing update_rq_clock() calls
    
    Add update_rq_clock() for 'target_rq' to avoid a WARN() coming
    from attach_task(). Also add rq_repin_lock(busiest, &rf); in
    load_balance() for detach_task(). An update_rq_clock() for
    'busiest' isn't necessary there, since its clock was already
    updated, but we do need the repin because the rq lock was
    released after that update.
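
    For reference (this describes the 5.14 rq_pin_lock()/rq_unpin_lock()
    helpers, not code from this patch): rq_unpin_lock() stashes
    RQCF_UPDATED into rf->clock_update_flags when the clock was fresh
    at unpin time, and rq_repin_lock() restores it, roughly

        rq->clock_update_flags |= rf->clock_update_flags;

    so the earlier update_rq_clock(busiest) remains visible to
    assert_clock_updated() even though the lock was dropped and
    re-taken in between.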
    
    https://jira.sw.ru/browse/PSBM-108013
    
    Reported-by: Kirill Tkhai <ktk...@virtuozzo.com>
    Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
    
    Acked-by: Kirill Tkhai <ktk...@virtuozzo.com>
    
    https://jira.sw.ru/browse/PSBM-133986
    
    See also:
    5cb9eaa3d ("sched: Wrap rq::lock access")
    36c5bdc43 ("sched/topology: Kill SD_LOAD_BALANCE")
    e669ac8ab ("sched: Remove checks against SD_LOAD_BALANCE")
    9818427c6 ("sched/debug: Make sd->flags sysctl read-only")
    
    (cherry picked from commit fbafc1d55798fb54805164bb79a99aba859b294d)
    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalit...@virtuozzo.com>
---
 include/linux/sched.h          |  29 +++
 include/linux/sched/sysctl.h   |   5 +
 include/linux/sched/topology.h |   5 +
 init/Kconfig                   |   4 +
 kernel/sched/core.c            |  44 +++++
 kernel/sched/fair.c            | 396 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h           |  16 ++
 kernel/sysctl.c                |  19 ++
 8 files changed, 518 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 31e9e41b9d9d..c91d4777aedd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -451,6 +451,9 @@ struct sched_statistics {
        u64                             nr_migrations_cold;
        u64                             nr_failed_migrations_affine;
        u64                             nr_failed_migrations_running;
+#ifdef CONFIG_CFS_CPULIMIT
+       u64                             nr_failed_migrations_cpulimit;
+#endif
        u64                             nr_failed_migrations_hot;
        u64                             nr_forced_migrations;
 
@@ -471,6 +474,9 @@ struct sched_entity {
        struct load_weight              load;
        struct rb_node                  run_node;
        struct list_head                group_node;
+#ifdef CONFIG_CFS_CPULIMIT
+       struct list_head                cfs_rq_node;
+#endif
        unsigned int                    on_rq;
 
        u64                             exec_start;
@@ -2053,6 +2059,29 @@ static inline bool vcpu_is_preempted(int cpu)
 }
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int task_nr_cpus(struct task_struct *p);
+extern unsigned int task_vcpu_id(struct task_struct *p);
+extern unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq);
+#else
+static inline unsigned int task_nr_cpus(struct task_struct *p)
+{
+       return num_online_cpus();
+}
+
+static inline unsigned int task_vcpu_id(struct task_struct *p)
+{
+       return task_cpu(p);
+}
+
+static inline unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+       return freq;
+}
+#endif
+
+#define num_online_vcpus() task_nr_cpus(current)
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index db2c0f34aaaf..b6adb2b82e52 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -99,4 +99,9 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int sysctl_sched_vcpu_hotslice;
+extern unsigned int sysctl_sched_cpulimit_scale_cpufreq;
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8f0f778b7c91..379fd57f665e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -118,6 +118,11 @@ struct sched_domain {
        unsigned int alb_failed;
        unsigned int alb_pushed;
 
+       /* cpulimit balancing */
+       unsigned int clb_count;
+       unsigned int clb_failed;
+       unsigned int clb_pushed;
+
        /* SD_BALANCE_EXEC stats */
        unsigned int sbe_count;
        unsigned int sbe_balanced;
diff --git a/init/Kconfig b/init/Kconfig
index 564553afb251..157a015393ac 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -966,9 +966,13 @@ config FAIR_GROUP_SCHED
        depends on CGROUP_SCHED
        default CGROUP_SCHED
 
+config CFS_CPULIMIT
+       bool
+
 config CFS_BANDWIDTH
        bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
        depends on FAIR_GROUP_SCHED
+       select CFS_CPULIMIT
        default n
        help
          This option allows users to define CPU bandwidth rates (limits) for
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ebb6dd99b442..d824282e942b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -371,6 +371,47 @@ static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
  */
 int sysctl_sched_rt_runtime = 950000;
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+       unsigned int nr_cpus = 0;
+       unsigned int max_nr_cpus = num_online_cpus();
+
+       rcu_read_lock();
+       nr_cpus = task_group(p)->nr_cpus;
+       rcu_read_unlock();
+
+       if (!nr_cpus || nr_cpus > max_nr_cpus)
+               nr_cpus = max_nr_cpus;
+
+       return nr_cpus;
+}
+
+unsigned int task_vcpu_id(struct task_struct *p)
+{
+       return task_cpu(p) % task_nr_cpus(p);
+}
+
+unsigned int sysctl_sched_cpulimit_scale_cpufreq = 1;
+
+unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+       unsigned long rate, max_rate;
+
+       if (!sysctl_sched_cpulimit_scale_cpufreq)
+               return freq;
+
+       rcu_read_lock();
+       rate = task_group(current)->cpu_rate;
+       rcu_read_unlock();
+
+       max_rate = num_online_vcpus() * MAX_CPU_RATE;
+       if (!rate || rate >= max_rate)
+               return freq;
+
+       return div_u64((u64)freq * rate, max_rate); /* avoid 32bit overflow */
+}
+#endif
 
 /*
  * Serialization rules:
@@ -9085,6 +9126,9 @@ void __init sched_init(void)
        INIT_LIST_HEAD(&root_task_group.children);
        INIT_LIST_HEAD(&root_task_group.siblings);
        autogroup_init(&init_task);
+#ifdef CONFIG_CFS_CPULIMIT
+       root_task_group.topmost_limited_ancestor = &root_task_group;
+#endif
 #endif /* CONFIG_CGROUP_SCHED */
 
        for_each_possible_cpu(i) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fb30663db2fe..c42ff00885c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -134,6 +134,11 @@ int __weak arch_asym_cpu_priority(int cpu)
  * (default: 5 msec, units: microseconds)
  */
 unsigned int sysctl_sched_cfs_bandwidth_slice          = 5000UL;
+
+#endif
+
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
 #endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -470,6 +475,88 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_CFS_CPULIMIT
+static int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->active;
+}
+
+static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+       /* if we canceled delayed dec, there is no need to do inc */
+       if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
+               atomic_inc(&cfs_rq->tg->nr_cpus_active);
+       cfs_rq->active = 1;
+}
+
+static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+       if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
+               postpone = 0;
+
+       if (!postpone) {
+               cfs_rq->active = 0;
+               atomic_dec(&cfs_rq->tg->nr_cpus_active);
+       } else {
+               hrtimer_start_range_ns(&cfs_rq->active_timer,
+                               ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
+                               HRTIMER_MODE_REL_PINNED);
+       }
+}
+
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+       struct cfs_rq *cfs_rq =
+               container_of(timer, struct cfs_rq, active_timer);
+       struct rq *rq = rq_of(cfs_rq);
+       unsigned long flags;
+
+       raw_spin_rq_lock_irqsave(rq, flags);
+       cfs_rq->active = !list_empty(&cfs_rq->tasks);
+       raw_spin_rq_unlock_irqrestore(rq, flags);
+
+       atomic_dec(&cfs_rq->tg->nr_cpus_active);
+
+       return HRTIMER_NORESTART;
+}
+
+static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+       int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
+       int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
+
+       nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
+               min_t(int, nr_cpus_limit, tg->nr_cpus) :
+               max_t(int, nr_cpus_limit, tg->nr_cpus);
+
+       if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
+               return 1;
+
+       if (nr_cpus_active > nr_cpus_limit)
+               return -1;
+
+       return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
+}
+#else /* !CONFIG_CFS_CPULIMIT */
+static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+}
+
+static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+       return 0;
+}
+
+static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+       return 1;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
@@ -2960,6 +3047,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
                account_numa_enqueue(rq, task_of(se));
                list_add(&se->group_node, &rq->cfs_tasks);
+#ifdef CONFIG_CFS_CPULIMIT
+               list_add(&se->cfs_rq_node, &cfs_rq->tasks);
+#endif
        }
 #endif
        cfs_rq->nr_running++;
@@ -2973,6 +3063,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        if (entity_is_task(se)) {
                account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                list_del_init(&se->group_node);
+#ifdef CONFIG_CFS_CPULIMIT
+               list_del(&se->cfs_rq_node);
+#endif
        }
 #endif
        cfs_rq->nr_running--;
@@ -4251,6 +4344,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
        bool curr = cfs_rq->curr == se;
 
+       if (!cfs_rq->load.weight)
+               inc_nr_active_cfs_rqs(cfs_rq);
        /*
         * If we're the current task, we must renormalise before calling
         * update_curr().
@@ -4408,6 +4503,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         */
        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                update_min_vruntime(cfs_rq);
+
+       if (!cfs_rq->load.weight)
+               dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
 }
 
 /*
@@ -5332,6 +5430,10 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
        cfs_rq->runtime_enabled = 0;
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_CFS_CPULIMIT
+       hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       cfs_rq->active_timer.function = sched_cfs_active_timer;
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5727,6 +5829,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 /* Working cpumask for: load_balance, load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+#ifdef CONFIG_CFS_CPULIMIT
+static DEFINE_PER_CPU(struct callback_head, cpulimit_cb_head);
+#endif
 
 #ifdef CONFIG_NO_HZ_COMMON
 
@@ -6844,6 +6949,38 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        return target;
 }
 
+static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+       struct task_group *tg;
+       struct sched_domain *sd;
+       int prev_cpu = task_cpu(p);
+       int cpu;
+
+       tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+       if (check_cpulimit_spread(tg, *new_cpu) > 0)
+               return false;
+
+       if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+               return true;
+
+       if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+               *new_cpu = prev_cpu;
+               return true;
+       }
+
+       for_each_domain(*new_cpu, sd) {
+               for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
+                       if (cfs_rq_active(tg->cfs_rq[cpu])) {
+                               *new_cpu = cpu;
+                               return true;
+                       }
+               }
+       }
+#endif
+       return false;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6903,6 +7040,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
                        break;
        }
 
+       if (select_runnable_cpu(p, &new_cpu))
+               goto unlock;
+
        if (unlikely(sd)) {
                /* Slow path */
                new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
@@ -6913,6 +7053,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
                if (want_affine)
                        current->recent_used_cpu = cpu;
        }
+unlock:
        rcu_read_unlock();
 
        return new_cpu;
@@ -7195,6 +7336,51 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
+static int cpulimit_balance_cpu_stop(void *data);
+
+static void trigger_cpulimit_balance(struct rq *this_rq)
+{
+       struct task_struct *p = this_rq->curr;
+       struct task_group *tg;
+       int this_cpu, cpu, target_cpu = -1;
+       struct sched_domain *sd;
+
+       this_cpu = cpu_of(this_rq);
+
+       if (!p->se.on_rq || this_rq->active_balance)
+               return;
+
+       tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+       if (check_cpulimit_spread(tg, this_cpu) >= 0)
+               return;
+
+       rcu_read_lock();
+       for_each_domain(this_cpu, sd) {
+               for_each_cpu_and(cpu, sched_domain_span(sd),
+                                p->cpus_ptr) {
+                       if (cpu != this_cpu &&
+                           cfs_rq_active(tg->cfs_rq[cpu])) {
+                               target_cpu = cpu;
+                               goto unlock;
+                       }
+               }
+       }
+unlock:
+       rcu_read_unlock();
+
+       if (target_cpu >= 0) {
+               this_rq->active_balance = 1;
+               this_rq->push_cpu = target_cpu;
+               raw_spin_rq_unlock(this_rq);
+               stop_one_cpu_nowait(this_rq->cpu,
+                                   cpulimit_balance_cpu_stop, this_rq,
+                                   &this_rq->active_balance_work);
+               raw_spin_rq_lock(this_rq);
+       }
+}
+#endif
+
 struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -7282,6 +7468,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
                set_next_entity(cfs_rq, se);
        }
 
+#ifdef CONFIG_CFS_CPULIMIT
+       queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
        goto done;
 simple:
 #endif
@@ -7311,6 +7500,9 @@ done: __maybe_unused;
 
        update_misfit_status(p, rq);
 
+#ifdef CONFIG_CFS_CPULIMIT
+       queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
        return p;
 
 idle:
@@ -7716,6 +7908,37 @@ static inline int migrate_degrades_locality(struct task_struct *p,
 }
 #endif
 
+static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+       struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+
+       if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
+               int cpu;
+
+               schedstat_inc(p->se.statistics.nr_failed_migrations_cpulimit);
+
+               env->flags |= LBF_SOME_PINNED;
+
+               if (check_cpulimit_spread(tg, env->src_cpu) != 0)
+                       return 0;
+
+               if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+                       return 0;
+
+               for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+                       if (cfs_rq_active(tg->cfs_rq[cpu])) {
+                               env->flags |= LBF_DST_PINNED;
+                               env->new_dst_cpu = cpu;
+                               break;
+                       }
+               }
+               return 0;
+       }
+#endif
+       return 1;
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -7726,6 +7949,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
        lockdep_assert_rq_held(env->src_rq);
 
+        if (!can_migrate_task_cpulimit(p, env))
+                return 0;
        /*
         * We do not migrate tasks that are:
         * 1) throttled_lb_pair, or
@@ -8087,6 +8312,161 @@ static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+static unsigned long entity_h_load(struct sched_entity *se);
+
+static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+       struct sched_entity *se;
+       struct task_struct *p;
+
+       list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
+               p = task_of(se);
+               if (task_curr(p) ||
+                   !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
+                       return 0;
+       }
+       env->flags &= ~LBF_ALL_PINNED;
+       return 1;
+}
+
+static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+       struct sched_entity *se, *tmp;
+       int moved = 0;
+
+       list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
+               struct task_struct *p = task_of(se);
+               detach_task(p, env);
+               attach_task(env->dst_rq, p);
+               moved++;
+       }
+       return moved;
+}
+
+static int move_task_groups(struct lb_env *env)
+{
+       struct cfs_rq *cfs_rq, *pos;
+       struct task_group *tg;
+       unsigned long load;
+       int cur_pulled, pulled = 0;
+
+       if (env->imbalance <= 0)
+               return 0;
+
+       for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+               if (cfs_rq->tg == &root_task_group)
+                       continue;
+               /*
+                * A child always goes before its parent in a leaf_cfs_rq_list.
+                * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+                * we could not migrate the child and therefore we should not
+                * even try to migrate the parent.
+                */
+               if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+                       continue;
+
+               tg = cfs_rq->tg->topmost_limited_ancestor;
+
+               if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
+                   cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
+                       continue;
+
+               load = entity_h_load(tg->se[env->src_cpu]);
+               if ((load / 2) > env->imbalance)
+                       continue;
+
+               if (!can_migrate_task_group(cfs_rq, env))
+                       continue;
+
+               cur_pulled = move_task_group(cfs_rq, env);
+               pulled += cur_pulled;
+               env->imbalance -= load;
+
+               env->loop += cur_pulled;
+               if (env->loop > env->loop_max)
+                       break;
+
+               if (env->imbalance <= 0)
+                       break;
+       }
+       return pulled;
+}
+
+static int do_cpulimit_balance(struct lb_env *env)
+{
+       struct cfs_rq *cfs_rq, *pos;
+       struct task_group *tg;
+       int pushed = 0;
+
+       for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+               if (cfs_rq->tg == &root_task_group)
+                       continue;
+               /* see move_task_groups for why we skip such groups */
+               if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+                       continue;
+               tg = cfs_rq->tg->topmost_limited_ancestor;
+               if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
+                   cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
+                   can_migrate_task_group(cfs_rq, env))
+                       pushed += move_task_group(cfs_rq, env);
+       }
+       return pushed;
+}
+
+static int cpulimit_balance_cpu_stop(void *data)
+{
+       struct rq *rq = data;
+       int cpu = cpu_of(rq);
+       int target_cpu = rq->push_cpu;
+       struct rq *target_rq = cpu_rq(target_cpu);
+       struct sched_domain *sd;
+
+       raw_spin_rq_lock_irq(rq);
+
+       if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
+                    !cpu_online(target_cpu)))
+               goto out_unlock;
+
+       if (unlikely(!rq->nr_running))
+               goto out_unlock;
+
+       BUG_ON(rq == target_rq);
+
+       double_lock_balance(rq, target_rq);
+       rcu_read_lock();
+       for_each_domain(target_cpu, sd) {
+               if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
+                               break;
+       }
+       if (likely(sd)) {
+               struct lb_env env = {
+                       .sd             = sd,
+                       .dst_cpu        = target_cpu,
+                       .dst_rq         = target_rq,
+                       .src_cpu        = cpu,
+                       .src_rq         = rq,
+               };
+
+               schedstat_inc(sd->clb_count);
+
+               update_rq_clock(rq);
+               update_rq_clock(target_rq);
+               if (do_cpulimit_balance(&env))
+                       schedstat_inc(sd->clb_pushed);
+               else
+                       schedstat_inc(sd->clb_failed);
+       }
+       rcu_read_unlock();
+       double_unlock_balance(rq, target_rq);
+
+out_unlock:
+       rq->active_balance = 0;
+       raw_spin_rq_unlock_irq(rq);
+       return 0;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static bool __update_blocked_others(struct rq *rq, bool *done)
 {
        const struct sched_class *curr_class;
@@ -9812,6 +10192,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
                local_irq_restore(rf.flags);
 
+#ifdef CONFIG_CFS_CPULIMIT
+               if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
+                       env.loop = 0;
+                       local_irq_save(rf.flags);
+                       double_rq_lock(env.dst_rq, busiest);
+                       rq_repin_lock(env.src_rq, &rf);
+                       update_rq_clock(env.dst_rq);
+                       cur_ld_moved = ld_moved = move_task_groups(&env);
+                       double_rq_unlock(env.dst_rq, busiest);
+                       local_irq_restore(rf.flags);
+                }
+#endif
+
                if (env.flags & LBF_NEED_BREAK) {
                        env.flags &= ~LBF_NEED_BREAK;
                        goto more_balance;
@@ -11251,6 +11644,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
        cfs_rq->tasks_timeline = RB_ROOT_CACHED;
+#ifdef CONFIG_CFS_CPULIMIT
+       INIT_LIST_HEAD(&cfs_rq->tasks);
+#endif
        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ed6e12e3eb65..9cddbc9920f8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -433,6 +433,14 @@ struct task_group {
        struct uclamp_se        uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+#define MAX_CPU_RATE 1024
+       unsigned long cpu_rate;
+       unsigned int nr_cpus;
+       atomic_t nr_cpus_active;
+       struct task_group *topmost_limited_ancestor; /* self if none of the
+                                                       ancestors is limited */
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -540,6 +548,9 @@ struct cfs_rq {
 #endif
 
        struct rb_root_cached   tasks_timeline;
+#ifdef CONFIG_CFS_CPULIMIT
+       struct list_head tasks;
+#endif
 
        /*
         * 'curr' points to currently running entity on this cfs_rq.
@@ -613,6 +624,10 @@ struct cfs_rq {
        int                     throttle_count;
        struct list_head        throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_CFS_CPULIMIT
+       int active;
+       struct hrtimer active_timer;
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
 
@@ -2087,6 +2102,7 @@ extern const u32          sched_prio_to_wmult[40];
 #define DEQUEUE_SAVE           0x02 /* Matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE           0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK                0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_TASK_SLEEP     0x10
 
 #define ENQUEUE_WAKEUP         0x01
 #define ENQUEUE_RESTORE                0x02
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 21d00e6954dd..5824d5dd2e1d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1866,6 +1866,25 @@ static struct ctl_table kern_table[] = {
                .extra2         = SYSCTL_ONE,
        },
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+       {
+               .procname       = "sched_vcpu_hotslice",
+               .data           = &sysctl_sched_vcpu_hotslice,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+       },
+       {
+               .procname       = "sched_cpulimit_scale_cpufreq",
+               .data           = &sysctl_sched_cpulimit_scale_cpufreq,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_ONE,
+       },
+#endif
 #ifdef CONFIG_PROVE_LOCKING
        {
                .procname       = "prove_locking",