Workload Consolidation is completely CPU topology and policy driven. To support
it, we define SD_WORKLOAD_CONSOLIDATION and add some fields to struct sched_domain:

1) total_groups is the total number of groups in this domain
2) group_number is this CPU's group sequence number
3) consolidating_coeff is the coefficient for consolidating CPUs, and is
   changeable via sysctl to make consolidation more or less aggressive
4) first_group is the pointer to this domain's first group ordered by CPU number

This patchset enables SD_WORKLOAD_CONSOLIDATION in the MC domain by default, but
we need to come up with a better way to determine on which architectures this
flag should be enabled. Thanks to PeterZ and Dietmar for pointing this out and
helping me finally understand it.

Signed-off-by: Yuyang Du <[email protected]>
---
 include/linux/sched.h |    8 +++++++-
 kernel/sched/core.c   |   46 ++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h  |   13 ++++++++++---
 3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1b1997d..a339467 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,6 +870,7 @@ enum cpu_idle_type {
 #define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
 #define SD_NUMA                        0x4000  /* cross-node balancing */
+#define SD_WORKLOAD_CONSOLIDATION  0x8000  /* consolidate CPU workload */
 
 #ifdef CONFIG_SCHED_SMT
 static inline const int cpu_smt_flags(void)
@@ -881,7 +882,7 @@ static inline const int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_MC
 static inline const int cpu_core_flags(void)
 {
-       return SD_SHARE_PKG_RESOURCES;
+       return SD_SHARE_PKG_RESOURCES | SD_WORKLOAD_CONSOLIDATION;
 }
 #endif
 
@@ -973,6 +974,11 @@ struct sched_domain {
                struct rcu_head rcu;    /* used during destruction */
        };
 
+       unsigned int total_groups;                      /* total group number */
+       unsigned int group_number;                      /* this CPU's group sequence */
+       unsigned int consolidating_coeff;       /* consolidating coefficient */
+       struct sched_group *first_group;        /* ordered by CPU number */
+
        unsigned int span_weight;
        /*
         * Span of all CPUs in this domain.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b..da3cd74 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4941,7 +4941,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-       struct ctl_table *table = sd_alloc_ctl_entry(14);
+       struct ctl_table *table = sd_alloc_ctl_entry(15);
 
        if (table == NULL)
                return NULL;
@@ -4974,7 +4974,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                sizeof(long), 0644, proc_doulongvec_minmax, false);
        set_table_entry(&table[12], "name", sd->name,
                CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-       /* &table[13] is terminator */
+       set_table_entry(&table[13], "consolidating_coeff", &sd->consolidating_coeff,
+               sizeof(int), 0644, proc_dointvec, false);
+       /* &table[14] is terminator */
 
        return table;
 }
@@ -5586,7 +5588,7 @@ static void update_top_cache_domain(int cpu)
        int id = cpu;
        int size = 1;
 
-       sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+       sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES, 1);
        if (sd) {
                id = cpumask_first(sched_domain_span(sd));
                size = cpumask_weight(sched_domain_span(sd));
@@ -5601,10 +5603,41 @@ static void update_top_cache_domain(int cpu)
        sd = lowest_flag_domain(cpu, SD_NUMA);
        rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
-       sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+       sd = highest_flag_domain(cpu, SD_ASYM_PACKING, 1);
        rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
 }
 
+
+DEFINE_PER_CPU(struct sched_domain *, sd_wc);
+
+static void update_wc_domain(struct sched_domain *sd, int cpu)
+{      /* Fills total_groups, group_number and first_group for sd and each parent, then sets sd_wc for cpu. */
+       while (sd) {
+               int i = 0, j = 0, first, min = INT_MAX; /* i: groups seen; j: groups whose first CPU is below 'first' */
+               struct sched_group *group;
+
+               group = sd->groups;
+               first = group_first_cpu(group); /* reference value: first CPU of sd->groups */
+               do {
+                       int k = group_first_cpu(group);
+                       i += 1;
+                       if (k < first)
+                               j += 1;
+                       if (k < min) {  /* remember the group with the lowest first CPU seen so far */
+                               sd->first_group = group;
+                               min = k;
+                       }
+               } while (group = group->next, group != sd->groups);     /* stop once the walk is back at sd->groups */
+
+               sd->total_groups = i;
+               sd->group_number = j;
+               sd = sd->parent;        /* continue with the parent domain */
+       }
+
+       sd = highest_flag_domain(cpu, SD_WORKLOAD_CONSOLIDATION, 0);    /* all == 0: domains without the flag are skipped, not a stop condition */
+       rcu_assign_pointer(per_cpu(sd_wc, cpu), sd);
+}
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
@@ -5653,6 +5686,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        destroy_sched_domains(tmp, cpu);
 
        update_top_cache_domain(cpu);
+
+       update_wc_domain(sd, cpu);
 }
 
 /* cpus with isolated domains */
@@ -6069,6 +6104,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
 #ifdef CONFIG_SCHED_DEBUG
                .name                   = tl->name,
 #endif
+               .consolidating_coeff = 0,
        };
 
        /*
@@ -6098,6 +6134,8 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                }
 
 #endif
+       } else if (sd->flags & SD_WORKLOAD_CONSOLIDATION) {
+               sd->consolidating_coeff = 160;
        } else {
                sd->flags |= SD_PREFER_SIBLING;
                sd->cache_nice_tries = 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb47ce2..a2a7230 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -695,16 +695,22 @@ extern void sched_ttwu_pending(void);
  *             be returned.
  * @flag:      The flag to check for the highest sched_domain
  *             for the given cpu.
+ * @all:       If set, all domains from the highest one down must have the flag
  *
  * Returns the highest sched_domain of a cpu which contains the given flag.
  */
-static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+static inline struct
+sched_domain *highest_flag_domain(int cpu, int flag, int all)
 {
        struct sched_domain *sd, *hsd = NULL;
 
        for_each_domain(cpu, sd) {
-               if (!(sd->flags & flag))
-                       break;
+               if (!(sd->flags & flag)) {
+                       if (all)
+                               break;
+                       else
+                               continue;
+               }
                hsd = sd;
        }
 
@@ -729,6 +735,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_wc);
 
 struct sched_group_capacity {
        atomic_t ref;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to