The 'sched_stat_*' tracepoints depend on schedstats being enabled at both
compile time (CONFIG_SCHEDSTATS) and runtime (sched_schedstats).
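
The runtime coupling comes from the stats helpers in kernel/sched/fair.c
bailing out early when schedstats is off, so the tracepoints nested inside
them are never reached.  Roughly (a simplified sketch, not the exact code):

    static inline void
    update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
            u64 delta;

            if (!schedstat_enabled())
                    return;         /* the tracepoint below never fires */

            delta = rq_clock(rq_of(cfs_rq)) -
                    schedstat_val(se->statistics.wait_start);
            if (entity_is_task(se))
                    trace_sched_stat_wait(task_of(se), delta);

            schedstat_add(se->statistics.wait_sum, delta);
            schedstat_set(se->statistics.wait_start, 0);
    }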

Besides adding code complexity, this dependency also causes confusion:

- When schedstats are disabled, the tracepoints don't fire at all.

- When schedstats are enabled at runtime, existing tasks have stale or
  missing statistics, which can cause the tracepoints either to not fire
  or to fire with corrupt data.

Decouple them so that the tracepoints always fire correctly, independent
of schedstats.  A side benefit is that latencytop and sleep profiling no
longer depend on schedstats.
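
The core of the change is to move the wait_start/sleep_start/block_start
timestamps out of sched_statistics and into sched_entity, so the deltas
the tracepoints need are always available; only the accumulated
max/sum/count statistics remain behind schedstat_enabled().  A simplified
sketch of the resulting layout:

    struct sched_entity {
            /* ... */
            u64     wait_start;     /* always maintained */
            u64     sleep_start;
            u64     block_start;
    #ifdef CONFIG_SCHEDSTATS
            struct sched_statistics statistics;     /* wait_max, wait_sum, ... */
    #endif
    };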

Code size:

  !CONFIG_SCHEDSTATS defconfig:

        text       data     bss      dec            hex filename
    10209818    4368184 1105920 15683922         ef5152 vmlinux.before.nostats
    10209818    4368312 1105920 15684050         ef51d2 vmlinux.after.nostats

  CONFIG_SCHEDSTATS defconfig:

        text       data     bss      dec            hex filename
    10214210    4370680 1105920 15690810         ef6c3a vmlinux.before.stats
    10214261    4370104 1105920 15690285         ef6a2d vmlinux.after.stats

Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
---
 include/linux/sched.h | 11 +++----
 kernel/latencytop.c   |  2 --
 kernel/profile.c      |  5 ----
 kernel/sched/core.c   | 16 +++--------
 kernel/sched/debug.c  | 13 +++++----
 kernel/sched/fair.c   | 80 ++++++++++++---------------------------------------
 lib/Kconfig.debug     |  1 -
 7 files changed, 34 insertions(+), 94 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e31758b..0ee8f04 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -940,10 +940,6 @@ static inline int sched_info_on(void)
 #endif
 }
 
-#ifdef CONFIG_SCHEDSTATS
-void force_schedstat_enabled(void);
-#endif
-
 enum cpu_idle_type {
        CPU_IDLE,
        CPU_NOT_IDLE,
@@ -1285,18 +1281,15 @@ struct sched_avg {
 
 #ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
-       u64                     wait_start;
        u64                     wait_max;
        u64                     wait_count;
        u64                     wait_sum;
        u64                     iowait_count;
        u64                     iowait_sum;
 
-       u64                     sleep_start;
        u64                     sleep_max;
        s64                     sum_sleep_runtime;
 
-       u64                     block_start;
        u64                     block_max;
        u64                     exec_max;
        u64                     slice_max;
@@ -1332,6 +1325,10 @@ struct sched_entity {
 
        u64                     nr_migrations;
 
+       u64                     wait_start;
+       u64                     sleep_start;
+       u64                     block_start;
+
 #ifdef CONFIG_SCHEDSTATS
        struct sched_statistics statistics;
 #endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b5c30d9..5ff6c54 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -296,8 +296,6 @@ int sysctl_latencytop(struct ctl_table *table, int write,
        int err;
 
        err = proc_dointvec(table, write, buffer, lenp, ppos);
-       if (latencytop_enabled)
-               force_schedstat_enabled();
 
        return err;
 }
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9..f033d79 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -58,8 +58,6 @@ int profile_setup(char *str)
        int par;
 
        if (!strncmp(str, sleepstr, strlen(sleepstr))) {
-#ifdef CONFIG_SCHEDSTATS
-               force_schedstat_enabled();
                prof_on = SLEEP_PROFILING;
                if (str[strlen(sleepstr)] == ',')
                        str += strlen(sleepstr) + 1;
@@ -67,9 +65,6 @@ int profile_setup(char *str)
                        prof_shift = par;
                pr_info("kernel sleep profiling enabled (shift: %ld)\n",
                        prof_shift);
-#else
-               pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
-#endif /* CONFIG_SCHEDSTATS */
        } else if (!strncmp(str, schedstr, strlen(schedstr))) {
                prof_on = SCHED_PROFILING;
                if (str[strlen(schedstr)] == ',')
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0aee5dd..9befffb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2267,14 +2267,6 @@ static void set_schedstats(bool enabled)
                static_branch_disable(&sched_schedstats);
 }
 
-void force_schedstat_enabled(void)
-{
-       if (!schedstat_enabled()) {
-               pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
-               static_branch_enable(&sched_schedstats);
-       }
-}
-
 static int __init setup_schedstats(char *str)
 {
        int ret = 0;
@@ -7589,10 +7581,10 @@ void normalize_rt_tasks(void)
                if (p->flags & PF_KTHREAD)
                        continue;
 
-               p->se.exec_start = 0;
-               schedstat_set(p->se.statistics.wait_start,  0);
-               schedstat_set(p->se.statistics.sleep_start, 0);
-               schedstat_set(p->se.statistics.block_start, 0);
+               p->se.exec_start  = 0;
+               p->se.wait_start  = 0;
+               p->se.sleep_start = 0;
+               p->se.block_start = 0;
 
                if (!dl_task(p) && !rt_task(p)) {
                        /*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1393588..06be44a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -382,10 +382,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        PN(se->exec_start);
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
+       PN(se->wait_start);
+       PN(se->sleep_start);
+       PN(se->block_start);
        if (schedstat_enabled()) {
-               PN_SCHEDSTAT(se->statistics.wait_start);
-               PN_SCHEDSTAT(se->statistics.sleep_start);
-               PN_SCHEDSTAT(se->statistics.block_start);
                PN_SCHEDSTAT(se->statistics.sleep_max);
                PN_SCHEDSTAT(se->statistics.block_max);
                PN_SCHEDSTAT(se->statistics.exec_max);
@@ -887,13 +887,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
        P(se.nr_migrations);
 
+       PN(se.wait_start);
+       PN(se.sleep_start);
+       PN(se.block_start);
+
        if (schedstat_enabled()) {
                u64 avg_atom, avg_per_cpu;
 
                PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
-               PN_SCHEDSTAT(se.statistics.wait_start);
-               PN_SCHEDSTAT(se.statistics.sleep_start);
-               PN_SCHEDSTAT(se.statistics.block_start);
                PN_SCHEDSTAT(se.statistics.sleep_max);
                PN_SCHEDSTAT(se.statistics.block_max);
                PN_SCHEDSTAT(se.statistics.exec_max);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae70600..3c85949 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -792,19 +792,15 @@ static void update_curr_fair(struct rq *rq)
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       u64 wait_start, prev_wait_start;
-
-       if (!schedstat_enabled())
-               return;
+       u64 wait_start;
 
        wait_start = rq_clock(rq_of(cfs_rq));
-       prev_wait_start = schedstat_val(se->statistics.wait_start);
 
        if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
-           likely(wait_start > prev_wait_start))
-               wait_start -= prev_wait_start;
+           likely(wait_start > se->wait_start))
+               wait_start -= se->wait_start;
 
-       schedstat_set(se->statistics.wait_start, wait_start);
+       se->wait_start = wait_start;
 }
 
 static inline void
@@ -813,10 +809,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
        struct task_struct *p;
        u64 delta;
 
-       if (!schedstat_enabled())
-               return;
-
-       delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
+       delta = rq_clock(rq_of(cfs_rq)) - se->wait_start;
 
        if (entity_is_task(se)) {
                p = task_of(se);
@@ -826,44 +819,38 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         * time stamp can be adjusted to accumulate wait time
                         * prior to migration.
                         */
-                       schedstat_set(se->statistics.wait_start, delta);
+                       se->wait_start = delta;
                        return;
                }
                trace_sched_stat_wait(p, delta);
        }
 
+       se->wait_start = 0;
        schedstat_set(se->statistics.wait_max,
                      max(schedstat_val(se->statistics.wait_max), delta));
        schedstat_inc(se->statistics.wait_count);
        schedstat_add(se->statistics.wait_sum, delta);
-       schedstat_set(se->statistics.wait_start, 0);
 }
 
 static inline void
 update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct task_struct *tsk = NULL;
-       u64 sleep_start, block_start;
-
-       if (!schedstat_enabled())
-               return;
-
-       sleep_start = schedstat_val(se->statistics.sleep_start);
-       block_start = schedstat_val(se->statistics.block_start);
 
        if (entity_is_task(se))
                tsk = task_of(se);
 
-       if (sleep_start) {
-               u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+       if (se->sleep_start) {
+               u64 delta = rq_clock(rq_of(cfs_rq)) - se->sleep_start;
 
                if ((s64)delta < 0)
                        delta = 0;
 
-               if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+               if (schedstat_enabled() &&
+                   unlikely(delta > schedstat_val(se->statistics.sleep_max)))
                        schedstat_set(se->statistics.sleep_max, delta);
 
-               schedstat_set(se->statistics.sleep_start, 0);
+               se->sleep_start = 0;
                schedstat_add(se->statistics.sum_sleep_runtime, delta);
 
                if (tsk) {
@@ -871,16 +858,17 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        trace_sched_stat_sleep(tsk, delta);
                }
        }
-       if (block_start) {
-               u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+       if (se->block_start) {
+               u64 delta = rq_clock(rq_of(cfs_rq)) - se->block_start;
 
                if ((s64)delta < 0)
                        delta = 0;
 
-               if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+               if (schedstat_enabled() &&
+                   unlikely(delta > schedstat_val(se->statistics.block_max)))
                        schedstat_set(se->statistics.block_max, delta);
 
-               schedstat_set(se->statistics.block_start, 0);
+               se->block_start = 0;
                schedstat_add(se->statistics.sum_sleep_runtime, delta);
 
                if (tsk) {
@@ -913,9 +901,6 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static inline void
 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-       if (!schedstat_enabled())
-               return;
-
        /*
         * Are we enqueueing a waiting task? (for current tasks
         * a dequeue/enqueue event is a NOP)
@@ -931,9 +916,6 @@ static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 
-       if (!schedstat_enabled())
-               return;
-
        /*
         * Mark the end of the wait period if dequeueing a
         * waiting task:
@@ -945,11 +927,9 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
                struct task_struct *tsk = task_of(se);
 
                if (tsk->state & TASK_INTERRUPTIBLE)
-                       schedstat_set(se->statistics.sleep_start,
-                                     rq_clock(rq_of(cfs_rq)));
+                       se->sleep_start = rq_clock(rq_of(cfs_rq));
                if (tsk->state & TASK_UNINTERRUPTIBLE)
-                       schedstat_set(se->statistics.block_start,
-                                     rq_clock(rq_of(cfs_rq)));
+                       se->block_start = rq_clock(rq_of(cfs_rq));
        }
 }
 
@@ -3211,27 +3191,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
-static inline void check_schedstat_required(void)
-{
-#ifdef CONFIG_SCHEDSTATS
-       if (schedstat_enabled())
-               return;
-
-       /* Force schedstat enabled if a dependent tracepoint is active */
-       if (trace_sched_stat_wait_enabled()    ||
-                       trace_sched_stat_sleep_enabled()   ||
-                       trace_sched_stat_iowait_enabled()  ||
-                       trace_sched_stat_blocked_enabled() ||
-                       trace_sched_stat_runtime_enabled())  {
-               printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
-                            "stat_blocked and stat_runtime require the "
-                            "kernel parameter schedstats=enabled or "
-                            "kernel.sched_schedstats=1\n");
-       }
-#endif
-}
-
-
 /*
  * MIGRATION
  *
@@ -3293,7 +3252,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        if (flags & ENQUEUE_WAKEUP)
                place_entity(cfs_rq, se, 0);
 
-       check_schedstat_required();
        update_stats_enqueue(cfs_rq, se, flags);
        check_spread(cfs_rq, se);
        if (!curr)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b9cfdbf..e8adfac 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1699,7 +1699,6 @@ config LATENCYTOP
        select KALLSYMS
        select KALLSYMS_ALL
        select STACKTRACE
-       select SCHEDSTATS
        select SCHED_DEBUG
        help
          Enable this option if you want to use the LatencyTOP tool
-- 
2.4.11
