On Mon, Jan 25, 2016 at 08:11:42PM +0000, Mel Gorman wrote:
> The vast majority of the checks are in the schedstat_[inc|dec|set] helpers.
> update_stats_wait_end() is an oddity in that it's a large function
> that only exists in schedstat and does a number of calculations. Ideally
> update_stats_wait would be updated too.  In the next revision, I'll create
> a schedstats_enabled() helper that is an alias of static_branch_unlikely()
> and returns 0 if stats are not configured in for use at the call-site
> of large functions like this. That has the added benefit of avoiding any
> function call overhead.
> 

Not even build tested, but this is the general direction I'm thinking of. It
adds a schedstat_enabled() helper and disables more schedstat-only code.
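
To make the call-site pattern concrete, here is a sketch based on the
try_to_wake_up() hunk below (schedstat_enabled() is defined in
kernel/sched/stats.h as an alias of static_branch_unlikely(&sched_schedstats),
so with CONFIG_SCHEDSTATS=n it is a constant 0 and the call is compiled out):

	/* skip the schedstat-only work entirely when stats are disabled */
	if (schedstat_enabled())
		ttwu_stat(p, cpu, wake_flags);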

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 87d40a72f6a1..846956abfe85 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3523,6 +3523,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        sched_debug     [KNL] Enables verbose scheduler debug messages.
 
+       schedstats=     [KNL,X86] Enable or disable scheduler statistics.
+                       Allowed values are enable and disable. This feature
+                       incurs a small amount of overhead in the scheduler
+                       but is useful for debugging and performance tuning.
+
        skew_tick=      [KNL] Offset the periodic timer tick per cpu to mitigate
                        xtime_lock contention on larger systems, and/or RCU lock
                        contention on all systems with CONFIG_MAXSMP set.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a93b414672a7..be7c3b720adf 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -760,6 +760,14 @@ rtsig-nr shows the number of RT signals currently queued.
 
 ==============================================================
 
+schedstats:
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+==============================================================
+
 sg-big-buff:
 
 This file shows the size of the generic SCSI (sg) buffer.
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..4f080ab4f2cd 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos);
 
+extern int sysctl_schedstats(struct ctl_table *table, int write,
+                                void __user *buffer, size_t *lenp,
+                                loff_t *ppos);
+
 #endif /* _SCHED_SYSCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 63d3a24e081a..42ea7b28a47b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2093,7 +2093,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        ttwu_queue(p, cpu);
 stat:
-       ttwu_stat(p, cpu, wake_flags);
+       if (schedstat_enabled())
+               ttwu_stat(p, cpu, wake_flags);
 out:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -2141,7 +2142,8 @@ static void try_to_wake_up_local(struct task_struct *p)
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
        ttwu_do_wakeup(rq, p, 0);
-       ttwu_stat(p, smp_processor_id(), 0);
+       if (schedstat_enabled())
+               ttwu_stat(p, smp_processor_id(), 0);
 out:
        raw_spin_unlock(&p->pi_lock);
 }
@@ -2210,6 +2212,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
+       /* Even if schedstat is disabled, there should not be garbage */
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
@@ -2281,6 +2284,61 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 #endif
 #endif
 
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+void set_schedstats(bool enabled)
+{
+       if (enabled)
+               static_branch_enable(&sched_schedstats);
+       else
+               static_branch_disable(&sched_schedstats);
+}
+
+static int __init setup_schedstats(char *str)
+{
+       int ret = 0;
+       if (!str)
+               goto out;
+
+       if (!strcmp(str, "enable")) {
+               set_schedstats(true);
+               ret = 1;
+       } else if (!strcmp(str, "disable")) {
+               set_schedstats(false);
+               ret = 1;
+       }
+out:
+       if (!ret)
+               pr_warn("Unable to parse schedstats=\n");
+
+       return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int err;
+       int state = static_branch_likely(&sched_schedstats);
+
+       if (write && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       t = *table;
+       t.data = &state;
+       err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+       if (err < 0)
+               return err;
+       if (write)
+               set_schedstats(state);
+       return err;
+}
+#endif
+#endif
+
 /*
  * fork()/clone()-time setup:
  */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 641511771ae6..79a3d91f554d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -75,16 +75,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
 #ifdef CONFIG_SCHEDSTATS
-       PN(se->statistics.wait_start);
-       PN(se->statistics.sleep_start);
-       PN(se->statistics.block_start);
-       PN(se->statistics.sleep_max);
-       PN(se->statistics.block_max);
-       PN(se->statistics.exec_max);
-       PN(se->statistics.slice_max);
-       PN(se->statistics.wait_max);
-       PN(se->statistics.wait_sum);
-       P(se->statistics.wait_count);
+       if (schedstat_enabled()) {
+               PN(se->statistics.wait_start);
+               PN(se->statistics.sleep_start);
+               PN(se->statistics.block_start);
+               PN(se->statistics.sleep_max);
+               PN(se->statistics.block_max);
+               PN(se->statistics.exec_max);
+               PN(se->statistics.slice_max);
+               PN(se->statistics.wait_max);
+               PN(se->statistics.wait_sum);
+               P(se->statistics.wait_count);
+       }
 #endif
        P(se->load.weight);
 #ifdef CONFIG_SMP
@@ -122,10 +124,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                (long long)(p->nvcsw + p->nivcsw),
                p->prio);
 #ifdef CONFIG_SCHEDSTATS
-       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(p->se.statistics.wait_sum),
-               SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       if (schedstat_enabled()) {
+               SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+                       SPLIT_NS(p->se.statistics.wait_sum),
+                       SPLIT_NS(p->se.sum_exec_runtime),
+                       SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       }
 #else
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
                0LL, 0L,
@@ -313,17 +317,19 @@ do {						\
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
 #define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
-       P(yld_count);
+       if (schedstat_enabled()) {
+               P(yld_count);
 
-       P(sched_count);
-       P(sched_goidle);
+               P(sched_count);
+               P(sched_goidle);
 #ifdef CONFIG_SMP
-       P64(avg_idle);
-       P64(max_idle_balance_cost);
+               P64(avg_idle);
+               P64(max_idle_balance_cost);
 #endif
 
-       P(ttwu_count);
-       P(ttwu_local);
+               P(ttwu_count);
+               P(ttwu_local);
+       }
 
 #undef P
 #undef P64
@@ -569,38 +575,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
-       PN(se.statistics.sum_sleep_runtime);
-       PN(se.statistics.wait_start);
-       PN(se.statistics.sleep_start);
-       PN(se.statistics.block_start);
-       PN(se.statistics.sleep_max);
-       PN(se.statistics.block_max);
-       PN(se.statistics.exec_max);
-       PN(se.statistics.slice_max);
-       PN(se.statistics.wait_max);
-       PN(se.statistics.wait_sum);
-       P(se.statistics.wait_count);
-       PN(se.statistics.iowait_sum);
-       P(se.statistics.iowait_count);
-       P(se.nr_migrations);
-       P(se.statistics.nr_migrations_cold);
-       P(se.statistics.nr_failed_migrations_affine);
-       P(se.statistics.nr_failed_migrations_running);
-       P(se.statistics.nr_failed_migrations_hot);
-       P(se.statistics.nr_forced_migrations);
-       P(se.statistics.nr_wakeups);
-       P(se.statistics.nr_wakeups_sync);
-       P(se.statistics.nr_wakeups_migrate);
-       P(se.statistics.nr_wakeups_local);
-       P(se.statistics.nr_wakeups_remote);
-       P(se.statistics.nr_wakeups_affine);
-       P(se.statistics.nr_wakeups_affine_attempts);
-       P(se.statistics.nr_wakeups_passive);
-       P(se.statistics.nr_wakeups_idle);
-
-       {
+       if (schedstat_enabled()) {
                u64 avg_atom, avg_per_cpu;
 
+               PN(se.statistics.sum_sleep_runtime);
+               PN(se.statistics.wait_start);
+               PN(se.statistics.sleep_start);
+               PN(se.statistics.block_start);
+               PN(se.statistics.sleep_max);
+               PN(se.statistics.block_max);
+               PN(se.statistics.exec_max);
+               PN(se.statistics.slice_max);
+               PN(se.statistics.wait_max);
+               PN(se.statistics.wait_sum);
+               P(se.statistics.wait_count);
+               PN(se.statistics.iowait_sum);
+               P(se.statistics.iowait_count);
+               P(se.nr_migrations);
+               P(se.statistics.nr_migrations_cold);
+               P(se.statistics.nr_failed_migrations_affine);
+               P(se.statistics.nr_failed_migrations_running);
+               P(se.statistics.nr_failed_migrations_hot);
+               P(se.statistics.nr_forced_migrations);
+               P(se.statistics.nr_wakeups);
+               P(se.statistics.nr_wakeups_sync);
+               P(se.statistics.nr_wakeups_migrate);
+               P(se.statistics.nr_wakeups_local);
+               P(se.statistics.nr_wakeups_remote);
+               P(se.statistics.nr_wakeups_affine);
+               P(se.statistics.nr_wakeups_affine_attempts);
+               P(se.statistics.nr_wakeups_passive);
+               P(se.statistics.nr_wakeups_idle);
+
                avg_atom = p->se.sum_exec_runtime;
                if (nr_switches)
                        avg_atom = div64_ul(avg_atom, nr_switches);
@@ -610,7 +616,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
                avg_per_cpu = p->se.sum_exec_runtime;
                if (p->se.nr_migrations) {
                        avg_per_cpu = div64_u64(avg_per_cpu,
-                                               p->se.nr_migrations);
+                                       p->se.nr_migrations);
                } else {
                        avg_per_cpu = -1LL;
                }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1926606ece80..e6ae52fd203f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -755,7 +755,9 @@ static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct task_struct *p;
-       u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+       u64 delta;
+
+       delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
 
        if (entity_is_task(se)) {
                p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
        se->statistics.wait_sum += delta;
        se->statistics.wait_start = 0;
 }
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
 
 /*
  * Task is being enqueued - update stats:
  */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        /*
         * Are we enqueueing a waiting task? (for current tasks
@@ -801,8 +793,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                update_stats_wait_start(cfs_rq, se);
 }
 
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        /*
         * Mark the end of the wait period if dequeueing a
@@ -810,7 +802,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         */
        if (se != cfs_rq->curr)
                update_stats_wait_end(cfs_rq, se);
+
+       if (flags & DEQUEUE_SLEEP) {
+               if (entity_is_task(se)) {
+                       struct task_struct *tsk = task_of(se);
+
+                       if (tsk->state & TASK_INTERRUPTIBLE)
+                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+                       if (tsk->state & TASK_UNINTERRUPTIBLE)
+                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+               }
+       }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
 }
+#endif
 
 /*
  * We are picking a new current task - update its stats:
@@ -3106,11 +3131,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
-               enqueue_sleeper(cfs_rq, se);
+               if (schedstat_enabled())
+                       enqueue_sleeper(cfs_rq, se);
        }
 
-       update_stats_enqueue(cfs_rq, se);
-       check_spread(cfs_rq, se);
+       if (schedstat_enabled()) {
+               update_stats_enqueue(cfs_rq, se);
+               check_spread(cfs_rq, se);
+       }
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
@@ -3177,19 +3205,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        update_curr(cfs_rq);
        dequeue_entity_load_avg(cfs_rq, se);
 
-       update_stats_dequeue(cfs_rq, se);
-       if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-               if (entity_is_task(se)) {
-                       struct task_struct *tsk = task_of(se);
-
-                       if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-                       if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-               }
-#endif
-       }
+       if (schedstat_enabled())
+               update_stats_dequeue(cfs_rq, se, flags);
 
        clear_buddies(cfs_rq, se);
 
@@ -3263,7 +3280,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 * a CPU. So account for the time it spent waiting on the
                 * runqueue.
                 */
-               update_stats_wait_end(cfs_rq, se);
+               if (schedstat_enabled())
+                       update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
                update_load_avg(se, 1);
        }
@@ -3276,7 +3294,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         * least twice that of our own weight (i.e. dont track it
         * when there are only lesser-weight tasks around):
         */
-       if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+       if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
                se->statistics.slice_max = max(se->statistics.slice_max,
                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
        }
@@ -3359,9 +3377,11 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
        /* throttle cfs_rqs exceeding runtime */
        check_cfs_rq_runtime(cfs_rq);
 
-       check_spread(cfs_rq, prev);
+       if (schedstat_enabled())
+               check_spread(cfs_rq, prev);
        if (prev->on_rq) {
-               update_stats_wait_start(cfs_rq, prev);
+               if (schedstat_enabled())
+                       update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
                /* in !on_rq case, update occurred at dequeue */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10f16374df7f..1d583870e1a6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1022,6 +1022,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
 
 static inline u64 global_rt_period(void)
 {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b0fbc7632de5..70b3b6a20fb0 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
        if (rq)
                rq->rq_sched_info.run_delay += delta;
 }
-# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)       do { var = (val); } while (0)
+# define schedstat_enabled()           static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field)      do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val)       do { if (schedstat_enabled()) { var = (val); } } while (0)
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
+# define schedstat_enabled()           0
 # define schedstat_inc(rq, field)      do { } while (0)
 # define schedstat_add(rq, field, amt) do { } while (0)
 # define schedstat_set(var, val)       do { } while (0)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 97715fd9e790..6fe70ccdf2ac 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_SCHEDSTATS
+       {
+               .procname       = "sched_schedstats",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = sysctl_schedstats,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif /* CONFIG_SCHEDSTATS */
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
        {
-- 
Mel Gorman
SUSE Labs
