On most small systems where user space is tightly controlled, the realtime
scheduling class can often be dispensed with to reduce the kernel footprint.
Let's make it configurable.

Signed-off-by: Nicolas Pitre <[email protected]>
---
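
Note for reviewers (not part of the commit message): with CONFIG_SCHED_RT=n,
rt_policy() always returns 0, so a user-space request for SCHED_FIFO or
SCHED_RR should now fail the valid_policy() check in __sched_setscheduler()
with -EINVAL, while kernel threads asking for RT are quietly downgraded to
SCHED_NORMAL at MIN_NICE (see the new block added to __sched_setscheduler()
below). A minimal user-space sketch of the expected behaviour on such a
kernel:

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 1 };

		/*
		 * With CONFIG_SCHED_RT=n this call is expected to fail with
		 * EINVAL, since SCHED_FIFO is no longer a valid policy.
		 */
		if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
			perror("sched_setscheduler(SCHED_FIFO)");

		return 0;
	}
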
 include/linux/init_task.h      | 15 +++++++++++----
 include/linux/sched.h          |  2 ++
 include/linux/sched/rt.h       |  4 ++--
 init/Kconfig                   | 14 ++++++++++++--
 kernel/sched/Makefile          |  4 ++--
 kernel/sched/core.c            | 42 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/debug.c           |  2 ++
 kernel/sched/sched.h           |  7 +++++--
 kernel/sched/stop_task.c       |  4 +++-
 kernel/sysctl.c                |  4 +++-
 kernel/time/posix-cpu-timers.c |  6 +++++-
 11 files changed, 86 insertions(+), 18 deletions(-)
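
As a sanity check on the sched_class linkage (my reading of the patch, not
something the patch spells out): disabling SCHED_RT also forces SCHED_DL off,
so the effective class chain collapses to roughly

	/* with CONFIG_SCHED_RT=n (and therefore CONFIG_SCHED_DL=n) */
	stop_sched_class.next == &fair_sched_class;	/* SMP builds */
	fair_sched_class.next == &idle_sched_class;	/* unchanged */

which keeps for_each_class() and the new sched_set_stop_task() fallback
working without any reference to rt_sched_class.
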

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc1..6befc0aa61 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -225,6 +225,16 @@ extern struct cred init_cred;
 #define INIT_TASK_SECURITY
 #endif
 
+#ifdef CONFIG_SCHED_RT
+#define INIT_TASK_RT(tsk)                                              \
+       .rt             = {                                             \
+               .run_list       = LIST_HEAD_INIT(tsk.rt.run_list),      \
+               .time_slice     = RR_TIMESLICE,                         \
+       },
+#else
+#define INIT_TASK_RT(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -250,10 +260,7 @@ extern struct cred init_cred;
        .se             = {                                             \
                .group_node     = LIST_HEAD_INIT(tsk.se.group_node),    \
        },                                                              \
-       .rt             = {                                             \
-               .run_list       = LIST_HEAD_INIT(tsk.rt.run_list),      \
-               .time_slice     = RR_TIMESLICE,                         \
-       },                                                              \
+       INIT_TASK_RT(tsk)                                               \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
        INIT_PUSHABLE_TASKS(tsk)                                        \
        INIT_CGROUP_SCHED(tsk)                                          \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba0c203669..71a43480ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -518,7 +518,9 @@ struct task_struct {
 
        const struct sched_class        *sched_class;
        struct sched_entity             se;
+#ifdef CONFIG_SCHED_RT
        struct sched_rt_entity          rt;
+#endif
 #ifdef CONFIG_CGROUP_SCHED
        struct task_group               *sched_task_group;
 #endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index f93329aba3..f2d636582d 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -7,7 +7,7 @@ struct task_struct;
 
 static inline int rt_prio(int prio)
 {
-       if (unlikely(prio < MAX_RT_PRIO))
+       if (IS_ENABLED(CONFIG_SCHED_RT) && unlikely(prio < MAX_RT_PRIO))
                return 1;
        return 0;
 }
@@ -17,7 +17,7 @@ static inline int rt_task(struct task_struct *p)
        return rt_prio(p->prio);
 }
 
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
diff --git a/init/Kconfig b/init/Kconfig
index f73e3f0940..3bcd49f576 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -687,7 +687,7 @@ config TREE_RCU_TRACE
 
 config RCU_BOOST
        bool "Enable RCU priority boosting"
-       depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+       depends on SCHED_RT && RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
        default n
        help
          This option boosts the priority of preempted RCU readers that
@@ -1090,7 +1090,7 @@ config CFS_BANDWIDTH
 
 config RT_GROUP_SCHED
        bool "Group scheduling for SCHED_RR/FIFO"
-       depends on CGROUP_SCHED
+       depends on CGROUP_SCHED && SCHED_RT
        default n
        help
          This feature lets you explicitly allocate real CPU bandwidth
@@ -1303,8 +1303,17 @@ config SCHED_AUTOGROUP
          desktop applications.  Task group autogeneration is currently based
          upon task session.
 
+config SCHED_RT
+       bool "Real Time Task Scheduling" if EXPERT
+       default y
+       help
+         This adds the sched_rt scheduling class to the kernel, providing
+         support for the SCHED_FIFO and SCHED_RR policies. You might want
+         to disable this to reduce the kernel size. If unsure, say y.
+
 config SCHED_DL
        bool "Deadline Task Scheduling" if EXPERT
+       depends on SCHED_RT
        default y
        help
          This adds the sched_dl scheduling class to the kernel providing
@@ -1632,6 +1641,7 @@ config BASE_FULL
 config FUTEX
        bool "Enable futex support" if EXPERT
        default y
+       depends on SCHED_RT
        select RT_MUTEXES
        help
          Disabling this option will cause the kernel to be built without
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 3bd6a7c1cc..bccbef85e5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,8 +16,8 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-y += idle_task.o fair.o rt.o
+obj-y += wait.o swait.o completion.o idle.o idle_task.o fair.o
+obj-$(CONFIG_SCHED_RT) += rt.o
 obj-$(CONFIG_SCHED_DL) += deadline.o $(if $(CONFIG_SMP),cpudeadline.o)
 obj-$(CONFIG_SMP) += cpupri.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7b004e440..3dd6fce750 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -640,6 +640,7 @@ bool sched_can_stop_tick(struct rq *rq)
                return false;
 #endif
 
+#ifdef CONFIG_SCHED_RT
        /*
         * If there are more than one RR tasks, we need the tick to effect the
         * actual RR behaviour.
@@ -658,6 +659,7 @@ bool sched_can_stop_tick(struct rq *rq)
        fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
        if (fifo_nr_running)
                return true;
+#endif
 
        /*
         * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
@@ -1586,7 +1588,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
                 * Reset it back to a normal scheduling class so that
                 * it can die in pieces.
                 */
-               old_stop->sched_class = &rt_sched_class;
+               old_stop->sched_class = stop_sched_class.next;
        }
 }
 
@@ -2182,11 +2184,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        __dl_clear_params(p);
 #endif
 
+#ifdef CONFIG_SCHED_RT
        INIT_LIST_HEAD(&p->rt.run_list);
        p->rt.timeout           = 0;
        p->rt.time_slice        = sched_rr_timeslice;
        p->rt.on_rq             = 0;
        p->rt.on_list           = 0;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3716,13 +3720,18 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
                p->sched_class = &dl_sched_class;
        } else
 #endif
+#ifdef CONFIG_SCHED_RT
        if (rt_prio(prio)) {
                if (oldprio < prio)
                        queue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
-       } else {
+       } else
+#endif
+       {
+#ifdef CONFIG_SCHED_RT
                if (rt_prio(oldprio))
                        p->rt.timeout = 0;
+#endif
                p->sched_class = &fair_sched_class;
        }
 
@@ -3997,6 +4006,23 @@ static int __sched_setscheduler(struct task_struct *p,
 
        /* May grab non-irq protected spin_locks: */
        BUG_ON(in_interrupt());
+
+       /*
+        * When the RT scheduling class is disabled, let's make sure kernel threads
+        * wanting RT still get lowest nice value to give them highest available
+        * priority rather than simply returning an error. Obviously we can't test
+        * rt_policy() here as it is always false in that case.
+        */
+       if (!IS_ENABLED(CONFIG_SCHED_RT) && !user &&
+           (policy == SCHED_FIFO || policy == SCHED_RR)) {
+               static const struct sched_attr k_attr = {
+                       .sched_policy = SCHED_NORMAL,
+                       .sched_nice = MIN_NICE,
+               };
+               attr = &k_attr;
+               policy = SCHED_NORMAL;
+       }
+
 recheck:
        /* Double check policy once rq lock held: */
        if (policy < 0) {
@@ -5726,7 +5752,9 @@ void __init sched_init_smp(void)
        sched_init_granularity();
        free_cpumask_var(non_isolated_cpus);
 
+#ifdef CONFIG_SCHED_RT
        init_sched_rt_class();
+#endif
 #ifdef CONFIG_SCHED_DL
        init_sched_dl_class();
 #endif
@@ -5832,7 +5860,9 @@ void __init sched_init(void)
        }
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
+#ifdef CONFIG_SCHED_RT
        init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), 
global_rt_runtime());
+#endif
 #ifdef CONFIG_SCHED_DL
        init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
 #endif
@@ -5864,7 +5894,10 @@ void __init sched_init(void)
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs);
+#ifdef CONFIG_SCHED_RT
                init_rt_rq(&rq->rt);
+               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#endif
 #ifdef CONFIG_SCHED_DL
                init_dl_rq(&rq->dl);
 #endif
@@ -5895,7 +5928,6 @@ void __init sched_init(void)
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
@@ -6132,7 +6164,9 @@ static DEFINE_SPINLOCK(task_group_lock);
 static void sched_free_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
+#ifdef CONFIG_SCHED_RT
        free_rt_sched_group(tg);
+#endif
        autogroup_free(tg);
        kmem_cache_free(task_group_cache, tg);
 }
@@ -6149,8 +6183,10 @@ struct task_group *sched_create_group(struct task_group *parent)
        if (!alloc_fair_sched_group(tg, parent))
                goto err;
 
+#ifdef CONFIG_SCHED_RT
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
+#endif
 
        return tg;
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 84f80a81ab..c550723ce9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,9 @@ do {                                                                    \
 
        spin_lock_irqsave(&sched_debug_lock, flags);
        print_cfs_stats(m, cpu);
+#ifdef CONFIG_SCHED_RT
        print_rt_stats(m, cpu);
+#endif
 #ifdef CONFIG_SCHED_DL
        print_dl_stats(m, cpu);
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 41dc10b707..38439eefd3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -132,7 +132,8 @@ static inline int fair_policy(int policy)
 
 static inline int rt_policy(int policy)
 {
-       return policy == SCHED_FIFO || policy == SCHED_RR;
+       return IS_ENABLED(CONFIG_SCHED_RT) &&
+              (policy == SCHED_FIFO || policy == SCHED_RR);
 }
 
 static inline int dl_policy(int policy)
@@ -1447,8 +1448,10 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
 #define sched_class_highest (&stop_sched_class)
 #elif defined(CONFIG_SCHED_DL)
 #define sched_class_highest (&dl_sched_class)
-#else
+#elif defined(CONFIG_SCHED_RT)
 #define sched_class_highest (&rt_sched_class)
+#else
+#define sched_class_highest (&fair_sched_class)
 #endif
 
 #define for_each_class(class) \
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 5632dc3e63..7cad8c1540 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -112,8 +112,10 @@ static void update_curr_stop(struct rq *rq)
 const struct sched_class stop_sched_class = {
 #ifdef CONFIG_SCHED_DL
        .next                   = &dl_sched_class,
-#else
+#elif defined(CONFIG_SCHED_RT)
        .next                   = &rt_sched_class,
+#else
+       .next                   = &fair_sched_class,
 #endif
 
        .enqueue_task           = enqueue_task_stop,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76c..1c670f4053 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -401,6 +401,7 @@ static struct ctl_table kern_table[] = {
        },
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_SCHED_RT
        {
                .procname       = "sched_rt_period_us",
                .data           = &sysctl_sched_rt_period,
@@ -422,6 +423,7 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = sched_rr_handler,
        },
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
        {
                .procname       = "sched_autogroup_enabled",
@@ -1071,7 +1073,7 @@ static struct ctl_table kern_table[] = {
                .extra1         = &neg_one,
        },
 #endif
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
        {
                .procname       = "max_lock_depth",
                .data           = &max_lock_depth,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d2a1e6dd02..010efb0e91 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -790,10 +790,12 @@ static void check_thread_timers(struct task_struct *tsk,
                                struct list_head *firing)
 {
        struct list_head *timers = tsk->cpu_timers;
-       struct signal_struct *const sig = tsk->signal;
        struct task_cputime *tsk_expires = &tsk->cputime_expires;
        u64 expires;
+#ifdef CONFIG_SCHED_RT
+       struct signal_struct *const sig = tsk->signal;
        unsigned long soft;
+#endif
 
        /*
         * If cputime_expires is zero, then there are no active
@@ -811,6 +813,7 @@ static void check_thread_timers(struct task_struct *tsk,
        tsk_expires->sched_exp = check_timers_list(++timers, firing,
                                                   tsk->se.sum_exec_runtime);
 
+#ifdef CONFIG_SCHED_RT
        /*
         * Check for the special case thread timers.
         */
@@ -847,6 +850,7 @@ static void check_thread_timers(struct task_struct *tsk,
                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
                }
        }
+#endif
        if (task_cputime_zero(tsk_expires))
                tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 }
-- 
2.9.4