* Paul Jackson <[EMAIL PROTECTED]> wrote:

> Nick wrote:
> > Ingo had a cool patch to estimate dirty => dirty cacheline transfer latency
> > ... Unfortunately ... and it is an O(cpus^2) operation.
> 
> Yes - a cool patch.

before we get into complexities, i'd like to see whether it solves Ken's 
performance problem. The attached patch (against BK-curr, but should 
apply to vanilla 2.6.12-rc1 too) adds the autodetection feature. (For 
ia64 i've hacked in a cache size of 9MB for Ken's test system.)
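
The per-arch hookup is small: an architecture sets sched_cache_size 
before the domains are built and then calls calibrate_cache_decay() - 
the ia64 and i386 hunks below do exactly that. As a hypothetical sketch 
for some other architecture's init path (l2_cache_bytes() is a made-up 
stand-in for whatever cache-size detection that arch already has):

        /* hypothetical arch hookup - l2_cache_bytes() is a made-up helper */
        sched_cache_size = l2_cache_bytes();
        calibrate_cache_decay();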

boots fine on x86, and gives this on a 4-way box:

 Brought up 4 CPUs
 migration cost matrix (cache_size: 524288, cpu: 2379 MHz):
         [00]  [01]  [02]  [03]
 [00]:    1.3   1.3   1.4   1.2
 [01]:    1.5   1.3   1.3   1.5
 [02]:    1.5   1.3   1.3   1.5
 [03]:    1.3   1.3   1.3   1.3
 min_delta: 1351361
 using cache_decay nsec: 1351361 (1 msec)

which is a pretty reasonable estimate on that box. (fast P4s, small 
cache)
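
The matrix entries are the measured worst-case deltas for each CPU 
pair, printed in approximate milliseconds: the formatting below shifts 
the nanosecond value right by 20 bits instead of dividing by 10^6, so 
one printed "msec" is really 2^20 nsec - which is why the min_delta of 
1351361 nsec shows up as 1.2 in the matrix and as "1 msec" in the 
summary line. A standalone userspace sketch of that fixed-point 
formatting (not part of the patch):

        #include <stdio.h>

        /* same shift-based nsec -> approx-msec formatting as the printk */
        static void print_approx_msec(unsigned long long delta_ns)
        {
                printf("%3llu.%llu\n", delta_ns >> 20,
                       ((delta_ns >> 10) / 102) % 10);
        }

        int main(void)
        {
                print_approx_msec(1351361ULL);  /* prints "  1.2" */
                return 0;
        }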

Ken, could you give it a go?

        Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -4699,6 +4699,232 @@ static void check_sibling_maps(void)
 #endif
 
 /*
+ * Task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the worst-case cost. Here are the
+ * steps that are taken:
+ *
+ * 1) the source CPU dirties its L2 cache with a shared buffer
+ * 2) the target CPU dirties its L2 cache with a local buffer
+ * 3) the target CPU dirties the shared buffer
+ *
+ * We measure the time step #3 takes - this is the cost of migrating
+ * a cache-hot task that has a large, dirty dataset in the L2 cache,
+ * to another CPU.
+ */
+
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cache misses that still end up filling the L2 cache:
+ */
+__init static void fill_cache(void *__cache, unsigned long __size)
+{
+       unsigned long size = __size/sizeof(long);
+       unsigned long *cache = __cache;
+       unsigned long data = 0xdeadbeef;
+       int i;
+
+       for (i = 0; i < size/4; i++) {
+               if ((i & 3) == 0)
+                       cache[i] = data;
+               if ((i & 3) == 1)
+                       cache[size-1-i] = data;
+               if ((i & 3) == 2)
+                       cache[size/2-i] = data;
+               if ((i & 3) == 3)
+                       cache[size/2+i] = data;
+       }
+}
+
+struct flush_data {
+       unsigned long source, target;
+       void (*fn)(void *, unsigned long);
+       void *cache;
+       void *local_cache;
+       unsigned long size;
+       unsigned long long delta;
+};
+
+/*
+ * Dirty L2 on the source CPU:
+ */
+__init static void source_handler(void *__data)
+{
+       struct flush_data *data = __data;
+
+       if (smp_processor_id() != data->source)
+               return;
+
+       memset(data->cache, 0, data->size);
+}
+
+/*
+ * Dirty the L2 cache on this CPU and then access the shared
+ * buffer (which represents the working set of the migrated task).
+ */
+__init static void target_handler(void *__data)
+{
+       struct flush_data *data = __data;
+       unsigned long long t0, t1;
+       unsigned long flags;
+
+       if (smp_processor_id() != data->target)
+               return;
+
+       memset(data->local_cache, 0, data->size);
+       local_irq_save(flags);
+       t0 = sched_clock();
+       fill_cache(data->cache, data->size);
+       t1 = sched_clock();
+       local_irq_restore(flags);
+
+       data->delta = t1 - t0;
+}
+
+/*
+ * Measure the cache-cost of one task migration:
+ */
+__init static unsigned long long measure_one(void *cache, unsigned long size,
+                                            int source, int target)
+{
+       struct flush_data data;
+       unsigned long flags;
+       void *local_cache;
+
+       local_cache = vmalloc(size);
+       if (!local_cache) {
+               printk("couldn't allocate local cache ...\n");
+               return 0;
+       }
+       memset(local_cache, 0, size);
+
+       local_irq_save(flags);
+       local_irq_enable();
+
+       data.source = source;
+       data.target = target;
+       data.size = size;
+       data.cache = cache;
+       data.local_cache = local_cache;
+
+       if (on_each_cpu(source_handler, &data, 1, 1) != 0) {
+               printk("measure_one: timed out waiting for other CPUs\n");
+               local_irq_restore(flags);
+               vfree(local_cache);
+               return 0;
+       }
+       if (on_each_cpu(target_handler, &data, 1, 1) != 0) {
+               printk("measure_one: timed out waiting for other CPUs\n");
+               local_irq_restore(flags);
+               vfree(local_cache);
+               return 0;
+       }
+
+       vfree(local_cache);
+
+       return data.delta;
+}
+
+__initdata unsigned long sched_cache_size;
+
+/*
+ * Measure a series of task migrations and return the maximum
+ * result - the worst-case. Since this code runs early during
+ * bootup the system is 'undisturbed' and the maximum latency
+ * makes sense.
+ *
+ * As the working set we use 2.1 times the L2 cache size; this
+ * asymmetric factor is chosen so that fill_cache() doesn't
+ * iterate at power-of-2 boundaries (which might hit cache mapping
+ * artifacts and pessimise the results).
+ */
+__init static unsigned long long measure_cacheflush_time(int cpu1, int cpu2)
+{
+       unsigned long size = sched_cache_size*21/10;
+       unsigned long long delta, max = 0;
+       void *cache;
+       int i;
+
+       if (!size) {
+               printk("arch has not set cachesize - using default.\n");
+               return 0;
+       }
+       if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+               printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+               return 0;
+       }
+       cache = vmalloc(size);
+       if (!cache) {
+               printk("could not vmalloc %ld bytes for cache!\n", size);
+               return 0;
+       }
+       memset(cache, 0, size);
+       for (i = 0; i < 20; i++) {
+               delta = measure_one(cache, size, cpu1, cpu2);
+               if (delta > max)
+                       max = delta;
+       }
+
+       vfree(cache);
+
+       /*
+        * A task is considered 'cache cold' if at least 2 times
+        * the worst-case cost of migration has passed.
+        * (this limit is only listened to if the load-balancing
+        * situation is 'nice' - if there is a large imbalance we
+        * ignore it for the sake of CPU utilization and
+        * processing fairness.)
+        *
+        * (We use 2.1 times the L2 cachesize in our measurement,
+        *  we keep this factor when returning.)
+        */
+       return max;
+}
+
+unsigned long long cache_decay_nsec;
+
+void __devinit calibrate_cache_decay(void)
+{
+       int cpu1 = -1, cpu2 = -1;
+       unsigned long long min_delta = -1ULL;
+
+       printk("migration cost matrix (cache_size: %ld, cpu: %ld MHz):\n",
+               sched_cache_size, cpu_khz/1000);
+       printk("      ");
+       for (cpu1 = 0; cpu1 < NR_CPUS; cpu1++) {
+               if (!cpu_online(cpu1))
+                       continue;
+               printk("  [%02d]", cpu1);
+       }
+       printk("\n");
+       for (cpu1 = 0; cpu1 < NR_CPUS; cpu1++) {
+               if (!cpu_online(cpu1))
+                       continue;
+               printk("[%02d]: ", cpu1);
+               for (cpu2 = 0; cpu2 < NR_CPUS; cpu2++) {
+                       unsigned long long delta;
+
+                       if (!cpu_online(cpu2))
+                               continue;
+                       delta = measure_cacheflush_time(cpu1, cpu2);
+
+                       printk(" %3Ld.%ld", delta >> 20,
+                               (((long)delta >> 10) / 102) % 10);
+                       if ((cpu1 != cpu2) && (delta < min_delta))
+                               min_delta = delta;
+               }
+               printk("\n");
+       }
+       printk("min_delta: %Ld\n", min_delta);
+       if (min_delta != -1ULL)
+               cache_decay_nsec = min_delta;
+       printk("using cache_decay nsec: %Ld (%Ld msec)\n",
+               cache_decay_nsec, cache_decay_nsec >> 20);
+}
+
+/*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
 static void __devinit arch_init_sched_domains(void)
@@ -4706,6 +4932,7 @@ static void __devinit arch_init_sched_do
        int i;
        cpumask_t cpu_default_map;
 
+       calibrate_cache_decay();
 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
        check_sibling_maps();
 #endif
--- linux/arch/ia64/kernel/domain.c.orig
+++ linux/arch/ia64/kernel/domain.c
@@ -139,6 +139,9 @@ void __devinit arch_init_sched_domains(v
        int i;
        cpumask_t cpu_default_map;
 
+       sched_cache_size = 9*1024*1024; /* hack for Kenneth */
+       calibrate_cache_decay();
+
        /*
         * Setup mask for cpus without special case scheduling requirements.
         * For now this just excludes isolated cpus, but could be used to
--- linux/arch/i386/kernel/smpboot.c.orig
+++ linux/arch/i386/kernel/smpboot.c
@@ -873,6 +873,7 @@ static void smp_tune_scheduling (void)
                        cachesize = 16; /* Pentiums, 2x8kB cache */
                        bandwidth = 100;
                }
+               sched_cache_size = cachesize * 1024;
        }
 }
 
--- linux/include/asm-ia64/topology.h.orig
+++ linux/include/asm-ia64/topology.h
@@ -51,7 +51,7 @@ void build_cpu_to_node_map(void);
        .max_interval           = 320,                  \
        .busy_factor            = 320,                  \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
+       .cache_hot_time         = cache_decay_nsec,     \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .flags                  = SD_LOAD_BALANCE       \
@@ -73,7 +73,7 @@ void build_cpu_to_node_map(void);
        .max_interval           = 320,                  \
        .busy_factor            = 320,                  \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
+       .cache_hot_time         = cache_decay_nsec,     \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .flags                  = SD_LOAD_BALANCE       \
--- linux/include/linux/topology.h.orig
+++ linux/include/linux/topology.h
@@ -61,6 +61,12 @@
 #endif
 
 /*
+ * total time penalty to migrate a typical application's cache contents
+ * from one CPU to another. Measured by the boot-time code.
+ */
+extern unsigned long long cache_decay_nsec;
+
+/*
  * Below are the 3 major initializers used in building sched_domains:
  * SD_SIBLING_INIT, for SMT domains
  * SD_CPU_INIT, for SMP domains
@@ -112,7 +118,7 @@
        .max_interval           = 4,                    \
        .busy_factor            = 64,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (5*1000000/2),        \
+       .cache_hot_time         = cache_decay_nsec,     \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .flags                  = SD_LOAD_BALANCE       \
--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -527,7 +527,12 @@ extern cpumask_t cpu_isolated_map;
 extern void init_sched_build_groups(struct sched_group groups[],
                                cpumask_t span, int (*group_fn)(int cpu));
 extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
+
 #endif /* ARCH_HAS_SCHED_DOMAIN */
+
+extern unsigned long sched_cache_size;
+extern void calibrate_cache_decay(void);
+
 #endif /* CONFIG_SMP */
 
 
--- linux/include/asm-i386/topology.h.orig
+++ linux/include/asm-i386/topology.h
@@ -75,7 +75,7 @@ static inline cpumask_t pcibus_to_cpumas
        .max_interval           = 32,                   \
        .busy_factor            = 32,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
+       .cache_hot_time         = cache_decay_nsec,     \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .flags                  = SD_LOAD_BALANCE       \
--- linux/include/asm-ppc64/topology.h.orig
+++ linux/include/asm-ppc64/topology.h
@@ -46,7 +46,7 @@ static inline int node_to_first_cpu(int 
        .max_interval           = 32,                   \
        .busy_factor            = 32,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
+       .cache_hot_time         = cache_decay_nsec,     \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .flags                  = SD_LOAD_BALANCE       \
--- linux/include/asm-x86_64/topology.h.orig
+++ linux/include/asm-x86_64/topology.h
@@ -48,7 +48,7 @@ static inline cpumask_t __pcibus_to_cpum
        .max_interval           = 32,                   \
        .busy_factor            = 32,                   \
        .imbalance_pct          = 125,                  \
-       .cache_hot_time         = (10*1000000),         \
+       .cache_hot_time         = cache_decay_nsec,     \
        .cache_nice_tries       = 1,                    \
        .per_cpu_gain           = 100,                  \
        .flags                  = SD_LOAD_BALANCE       \
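
For context (not part of this patch): the cache_hot_time value that the 
topology.h changes now take from cache_decay_nsec is what the 2.6 
scheduler's task_hot() check compares against when deciding whether 
migrating a task would throw away a warm cache. Roughly - this is a 
paraphrase of the 2.6-era macro in kernel/sched.c, not a quote:

        /* paraphrased: a task is cache-hot if it last ran less than
         * cache_hot_time nanoseconds ago */
        static inline int task_hot(struct task_struct *p,
                                   unsigned long long now,
                                   struct sched_domain *sd)
        {
                return (long long)(now - p->last_ran) <
                       (long long)sd->cache_hot_time;
        }

So the boot-time calibration above directly scales how reluctant the 
load balancer is to move recently-run tasks.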
