On systems with 4096 cores, doing a cat of /proc/schedstat fails.
We are trying to push all the data into a single kmalloc buffer.
The issue is that on these very large machines all the data will not fit in 4MB.

A better solution is to not use the single_open mechanism but to provide
our own seq_operations.

The output is identical to the previous version, so the schedstat
version number does not need to be bumped.

Signed-off-by: Nathan Zimmer <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: [email protected]
CC: Al Viro <[email protected]>

---
 kernel/sched/stats.c |  154 ++++++++++++++++++++++++++++++-------------------
 1 files changed, 94 insertions(+), 60 deletions(-)

diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9..21f7bc2 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,86 +21,120 @@ static int show_schedstat(struct seq_file *seq, void *v)
        if (mask_str == NULL)
                return -ENOMEM;
 
-       seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
-       seq_printf(seq, "timestamp %lu\n", jiffies);
-       for_each_online_cpu(cpu) {
-               struct rq *rq = cpu_rq(cpu);
+       if (v == (void *)1) {
+               seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+               seq_printf(seq, "timestamp %lu\n", jiffies);
+               kfree(mask_str);
+               return 0;
+       }
+       cpu = (unsigned long)(v - 2);
+
+       struct rq *rq = cpu_rq(cpu);
 #ifdef CONFIG_SMP
-               struct sched_domain *sd;
-               int dcount = 0;
+       struct sched_domain *sd;
+       int dcount = 0;
 #endif
 
-               /* runqueue-specific stats */
-               seq_printf(seq,
-                   "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
-                   cpu, rq->yld_count,
-                   rq->sched_count, rq->sched_goidle,
-                   rq->ttwu_count, rq->ttwu_local,
-                   rq->rq_cpu_time,
-                   rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+       /* runqueue-specific stats */
+       seq_printf(seq,
+           "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
+           cpu, rq->yld_count,
+           rq->sched_count, rq->sched_goidle,
+           rq->ttwu_count, rq->ttwu_local,
+           rq->rq_cpu_time,
+           rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
-               seq_printf(seq, "\n");
+       seq_printf(seq, "\n");
 
 #ifdef CONFIG_SMP
-               /* domain-specific stats */
-               rcu_read_lock();
-               for_each_domain(cpu, sd) {
-                       enum cpu_idle_type itype;
-
-                       cpumask_scnprintf(mask_str, mask_len,
-                                         sched_domain_span(sd));
-                       seq_printf(seq, "domain%d %s", dcount++, mask_str);
-                       for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
-                                       itype++) {
-                               seq_printf(seq, " %u %u %u %u %u %u %u %u",
-                                   sd->lb_count[itype],
-                                   sd->lb_balanced[itype],
-                                   sd->lb_failed[itype],
-                                   sd->lb_imbalance[itype],
-                                   sd->lb_gained[itype],
-                                   sd->lb_hot_gained[itype],
-                                   sd->lb_nobusyq[itype],
-                                   sd->lb_nobusyg[itype]);
-                       }
-                       seq_printf(seq,
-                                  " %u %u %u %u %u %u %u %u %u %u %u %u\n",
-                           sd->alb_count, sd->alb_failed, sd->alb_pushed,
-                           sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
-                           sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
-                           sd->ttwu_wake_remote, sd->ttwu_move_affine,
-                           sd->ttwu_move_balance);
+       /* domain-specific stats */
+       rcu_read_lock();
+       for_each_domain(cpu, sd) {
+               enum cpu_idle_type itype;
+
+               cpumask_scnprintf(mask_str, mask_len,
+                                 sched_domain_span(sd));
+               seq_printf(seq, "domain%d %s", dcount++, mask_str);
+               for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+                               itype++) {
+                       seq_printf(seq, " %u %u %u %u %u %u %u %u",
+                           sd->lb_count[itype],
+                           sd->lb_balanced[itype],
+                           sd->lb_failed[itype],
+                           sd->lb_imbalance[itype],
+                           sd->lb_gained[itype],
+                           sd->lb_hot_gained[itype],
+                           sd->lb_nobusyq[itype],
+                           sd->lb_nobusyg[itype]);
                }
-               rcu_read_unlock();
-#endif
+               seq_printf(seq,
+                          " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+                   sd->alb_count, sd->alb_failed, sd->alb_pushed,
+                   sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+                   sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+                   sd->ttwu_wake_remote, sd->ttwu_move_affine,
+                   sd->ttwu_move_balance);
        }
+       rcu_read_unlock();
+#endif
        kfree(mask_str);
        return 0;
 }
 
-static int schedstat_open(struct inode *inode, struct file *file)
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
 {
-       unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
-       char *buf = kmalloc(size, GFP_KERNEL);
-       struct seq_file *m;
-       int res;
+       unsigned long n = *offset;
 
-       if (!buf)
-               return -ENOMEM;
-       res = single_open(file, show_schedstat, NULL);
-       if (!res) {
-               m = file->private_data;
-               m->buf = buf;
-               m->size = size;
-       } else
-               kfree(buf);
-       return res;
+       if (n == 0)
+               return (void *) 1;
+
+       n--;
+
+       if (n > 0)
+               n = cpumask_next(n - 1, cpu_online_mask);
+       else
+               n = cpumask_first(cpu_online_mask);
+
+       *offset = n + 1;
+
+       if (n < nr_cpu_ids)
+               return (void *)(unsigned long)(n + 2);
+
+       return NULL;
 }
 
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+       (*offset)++;
+       return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+       .start = schedstat_start,
+       .next  = schedstat_next,
+       .stop  = schedstat_stop,
+       .show  = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &schedstat_sops);
+}
+
+static int schedstat_release(struct inode *inode, struct file *file)
+{
+       return seq_release(inode, file);
+}
+
 static const struct file_operations proc_schedstat_operations = {
        .open    = schedstat_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-       .release = single_release,
+       .release = schedstat_release,
 };
 
 static int __init proc_schedstat_init(void)
-- 
1.6.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to