[PATCH 2/2] powerpc/smp: Use GFP_ATOMIC while allocating tmp mask

2020-10-18 Thread Srikar Dronamraju
Qian Cai reported a regression where CPU Hotplug fails with the latest
powerpc/next

BUG: sleeping function called from invalid context at mm/slab.h:494
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/88
no locks held by swapper/88/0.
irq event stamp: 18074448
hardirqs last  enabled at (18074447): [] 
tick_nohz_idle_enter+0x9c/0x110
hardirqs last disabled at (18074448): [] do_idle+0x138/0x3b0
do_idle at kernel/sched/idle.c:253 (discriminator 1)
softirqs last  enabled at (18074440): [] 
irq_enter_rcu+0x94/0xa0
softirqs last disabled at (18074439): [] 
irq_enter_rcu+0x70/0xa0
CPU: 88 PID: 0 Comm: swapper/88 Tainted: GW 
5.9.0-rc8-next-20201007 #1
Call Trace:
[c0002a4bfcf0] [c0649e98] dump_stack+0xec/0x144 (unreliable)
[c0002a4bfd30] [c00f6c34] ___might_sleep+0x2f4/0x310
[c0002a4bfdb0] [c0354f94] 
slab_pre_alloc_hook.constprop.82+0x124/0x190
[c0002a4bfe00] [c035e9e8] __kmalloc_node+0x88/0x3a0
slab_alloc_node at mm/slub.c:2817
(inlined by) __kmalloc_node at mm/slub.c:4013
[c0002a4bfe80] [c06494d8] alloc_cpumask_var_node+0x38/0x80
kmalloc_node at include/linux/slab.h:577
(inlined by) alloc_cpumask_var_node at lib/cpumask.c:116
[c0002a4bfef0] [c003eedc] start_secondary+0x27c/0x800
update_mask_by_l2 at arch/powerpc/kernel/smp.c:1267
(inlined by) add_cpu_to_masks at arch/powerpc/kernel/smp.c:1387
(inlined by) start_secondary at arch/powerpc/kernel/smp.c:1420
[c0002a4bff90] [c000c468] start_secondary_resume+0x10/0x14

Allocating a temporary mask while performing a CPU Hotplug operation
with CONFIG_CPUMASK_OFFSTACK enabled leads to calling a sleepable
function from an atomic context. Fix this by allocating the temporary
mask with the GFP_ATOMIC flag. Also, instead of having to allocate twice,
allocate the mask once in the caller. If the allocation fails, assume the
mask to be the same as the sibling mask, which will make the scheduler
drop this domain for this CPU.
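
A minimal sketch of the resulting flow (simplified, not the exact hunks;
the update_coregroup_mask() signature here is assumed to mirror
update_mask_by_l2()):

static void add_cpu_to_masks(int cpu)
{
	cpumask_var_t mask;

	/*
	 * The CPU-hotplug path runs with interrupts disabled, so the
	 * allocation must not sleep: use GFP_ATOMIC, and allocate once
	 * here instead of once per callee.
	 */
	alloc_cpumask_var_node(&mask, GFP_ATOMIC, cpu_to_node(cpu));

	/*
	 * On allocation failure the callees see a NULL mask and fall
	 * back to the sibling mask, so the scheduler simply drops the
	 * corresponding domain for this CPU.
	 */
	update_mask_by_l2(cpu, &mask);

	if (has_coregroup_support())
		update_coregroup_mask(cpu, &mask);	/* assumed signature */

	free_cpumask_var(mask);
}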

Fixes: 70a94089d7f7 ("powerpc/smp: Optimize update_coregroup_mask")
Fixes: 3ab33d6dc3e9 ("powerpc/smp: Optimize update_mask_by_l2")
Reported-by: Qian Cai 
Signed-off-by: Srikar Dronamraju 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nathan Lynch 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
Changelog v1->v2:
https://lore.kernel.org/linuxppc-dev/20201008034240.34059-1-sri...@linux.vnet.ibm.com/t/#u
Updated 2nd patch based on comments from Michael Ellerman
- Remove the WARN_ON.
- Handle allocation failures in a more subtle fashion
- Allocate in the caller so that we allocate once.

 arch/powerpc/kernel/smp.c | 57 +--
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index a864b9b3228c..028479e9b66b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1257,38 +1257,33 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
 }
 
-static bool update_mask_by_l2(int cpu)
+static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
 {
struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
struct device_node *l2_cache, *np;
-   cpumask_var_t mask;
int i;
 
if (has_big_cores)
submask_fn = cpu_smallcore_mask;
 
l2_cache = cpu_to_l2cache(cpu);
-   if (!l2_cache) {
-   /*
-* If no l2cache for this CPU, assume all siblings to share
-* cache with this CPU.
-*/
+   if (!l2_cache || !*mask) {
+   /* Assume only core siblings share cache with this CPU */
for_each_cpu(i, submask_fn(cpu))
set_cpus_related(cpu, i, cpu_l2_cache_mask);
 
return false;
}
 
-   alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
-   cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
+   cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
 
/* Update l2-cache mask with all the CPUs that are part of submask */
or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
 
/* Skip all CPUs already part of current CPU l2-cache mask */
-   cpumask_andnot(mask, mask, cpu_l2_cache_mask(cpu));
+   cpumask_andnot(*mask, *mask, cpu_l2_cache_mask(cpu));
 
-   for_each_cpu(i, mask) {
+   for_each_cpu(i, *mask) {
/*
 * when updating the marks the current CPU has not been marked
 * online, but we need to update the cache masks
@@ -1298,15 +1293,14 @@ static bool update_mask_by_l2(int cpu)
/* Skip all CPUs already part of current CPU l2-cache */
if (np == l2_cache) {
or_cpumasks_related(cpu, i, submask_fn, 
cpu_l2_cache_mask);
-   cpumask_andnot(m

[PATCH 1/2] powerpc/smp: Remove unnecessary variable

2020-10-18 Thread Srikar Dronamraju
Commit 3ab33d6dc3e9 ("powerpc/smp: Optimize update_mask_by_l2")
introduced submask_fn in update_mask_by_l2 to track the right submask.
However commit f6606cfdfbcd ("powerpc/smp: Dont assume l2-cache to be
superset of sibling") introduced sibling_mask in update_mask_by_l2 to
track the same submask. Remove sibling_mask in favour of submask_fn.

Signed-off-by: Srikar Dronamraju 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nathan Lynch 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8d1c401f4617..a864b9b3228c 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1264,18 +1264,16 @@ static bool update_mask_by_l2(int cpu)
cpumask_var_t mask;
int i;
 
+   if (has_big_cores)
+   submask_fn = cpu_smallcore_mask;
+
l2_cache = cpu_to_l2cache(cpu);
if (!l2_cache) {
-   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
-
/*
 * If no l2cache for this CPU, assume all siblings to share
 * cache with this CPU.
 */
-   if (has_big_cores)
-   sibling_mask = cpu_smallcore_mask;
-
-   for_each_cpu(i, sibling_mask(cpu))
+   for_each_cpu(i, submask_fn(cpu))
set_cpus_related(cpu, i, cpu_l2_cache_mask);
 
return false;
@@ -1284,9 +1282,6 @@ static bool update_mask_by_l2(int cpu)
alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
 
-   if (has_big_cores)
-   submask_fn = cpu_smallcore_mask;
-
/* Update l2-cache mask with all the CPUs that are part of submask */
or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
 
-- 
2.18.2



[PATCH v2 0/2] Fixes for coregroup

2020-10-18 Thread Srikar Dronamraju
These patches fix problems introduced by the coregroup patches.
The first patch removes a redundant variable.
The second patch allows booting with CONFIG_CPUMASK_OFFSTACK enabled.

Changelog v1->v2:
https://lore.kernel.org/linuxppc-dev/20201008034240.34059-1-sri...@linux.vnet.ibm.com/t/#u
1. 1st patch was not part of previous posting.
2. Updated 2nd patch based on comments from Michael Ellerman

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nathan Lynch 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 

Srikar Dronamraju (2):
  powerpc/smp: Remove unnecessary variable
  powerpc/smp: Use GFP_ATOMIC while allocating tmp mask

 arch/powerpc/kernel/smp.c | 70 +++
 1 file changed, 35 insertions(+), 35 deletions(-)

-- 
2.18.2



[PATCH] powerpc/smp: Use GFP_ATOMIC while allocating tmp mask

2020-10-07 Thread Srikar Dronamraju
Qian Cai reported a regression where CPU Hotplug fails with the latest
powerpc/next

BUG: sleeping function called from invalid context at mm/slab.h:494
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/88
no locks held by swapper/88/0.
irq event stamp: 18074448
hardirqs last  enabled at (18074447): [] 
tick_nohz_idle_enter+0x9c/0x110
hardirqs last disabled at (18074448): [] do_idle+0x138/0x3b0
do_idle at kernel/sched/idle.c:253 (discriminator 1)
softirqs last  enabled at (18074440): [] 
irq_enter_rcu+0x94/0xa0
softirqs last disabled at (18074439): [] 
irq_enter_rcu+0x70/0xa0
CPU: 88 PID: 0 Comm: swapper/88 Tainted: GW 
5.9.0-rc8-next-20201007 #1
Call Trace:
[c0002a4bfcf0] [c0649e98] dump_stack+0xec/0x144 (unreliable)
[c0002a4bfd30] [c00f6c34] ___might_sleep+0x2f4/0x310
[c0002a4bfdb0] [c0354f94] 
slab_pre_alloc_hook.constprop.82+0x124/0x190
[c0002a4bfe00] [c035e9e8] __kmalloc_node+0x88/0x3a0
slab_alloc_node at mm/slub.c:2817
(inlined by) __kmalloc_node at mm/slub.c:4013
[c0002a4bfe80] [c06494d8] alloc_cpumask_var_node+0x38/0x80
kmalloc_node at include/linux/slab.h:577
(inlined by) alloc_cpumask_var_node at lib/cpumask.c:116
[c0002a4bfef0] [c003eedc] start_secondary+0x27c/0x800
update_mask_by_l2 at arch/powerpc/kernel/smp.c:1267
(inlined by) add_cpu_to_masks at arch/powerpc/kernel/smp.c:1387
(inlined by) start_secondary at arch/powerpc/kernel/smp.c:1420
[c0002a4bff90] [c000c468] start_secondary_resume+0x10/0x14

Allocating a temporary mask while performing a CPU Hotplug operation
with CONFIG_CPUMASK_OFFSTACK enabled leads to calling a sleepable
function from an atomic context. Fix this by allocating the temporary
mask with the GFP_ATOMIC flag.

If the mask allocation fails, the scheduler is going to observe that
this CPU's topology is broken. Instead of having to speculate why the
topology is broken, add a WARN_ON_ONCE.

Fixes: 70a94089d7f7 ("powerpc/smp: Optimize update_coregroup_mask")
Fixes: 3ab33d6dc3e9 ("powerpc/smp: Optimize update_mask_by_l2")
Reported-by: Qian Cai 
Suggested-by: Qian Cai 
Signed-off-by: Srikar Dronamraju 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nathan Lynch 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 0dc1b85..1268558 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1264,7 +1264,8 @@ static bool update_mask_by_l2(int cpu)
return false;
}
 
-   alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
+   /* In CPU-hotplug path, hence use GFP_ATOMIC */
+   WARN_ON_ONCE(!alloc_cpumask_var_node(&mask, GFP_ATOMIC, 
cpu_to_node(cpu)));
cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
 
if (has_big_cores)
@@ -1344,7 +1345,8 @@ static void update_coregroup_mask(int cpu)
int coregroup_id = cpu_to_coregroup_id(cpu);
int i;
 
-   alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
+   /* In CPU-hotplug path, hence use GFP_ATOMIC */
+   WARN_ON_ONCE(!alloc_cpumask_var_node(&mask, GFP_ATOMIC, 
cpu_to_node(cpu)));
cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
 
if (shared_caches)
-- 
1.8.3.1



[PATCH v3 06/11] powerpc/smp: Stop passing mask to update_mask_by_l2

2020-10-07 Thread Srikar Dronamraju
update_mask_by_l2 is called from only one place, and that caller always
passes cpu_l2_cache_mask as the parameter. Instead of passing
cpu_l2_cache_mask, use it directly in update_mask_by_l2.

Signed-off-by: Srikar Dronamraju 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c860c4950c9f..441c9c64b1e3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1218,7 +1218,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
 }
 
-static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
+static bool update_mask_by_l2(int cpu)
 {
struct device_node *l2_cache, *np;
int i;
@@ -1240,7 +1240,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
*(*mask_fn)(int))
return false;
}
 
-   cpumask_set_cpu(cpu, mask_fn(cpu));
+   cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu));
for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1251,7 +1251,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
*(*mask_fn)(int))
continue;
 
if (np == l2_cache)
-   set_cpus_related(cpu, i, mask_fn);
+   set_cpus_related(cpu, i, cpu_l2_cache_mask);
 
of_node_put(np);
}
@@ -1315,7 +1315,7 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   update_mask_by_l2(cpu, cpu_l2_cache_mask);
+   update_mask_by_l2(cpu);
 
if (has_coregroup_support()) {
int coregroup_id = cpu_to_coregroup_id(cpu);
-- 
2.17.1



[PATCH v3 11/11] powerpc/smp: Optimize update_coregroup_mask

2020-10-07 Thread Srikar Dronamraju
All threads of an SMT4/SMT8 core are either part of this CPU's coregroup
mask or entirely outside the coregroup. Use this relation to reduce the
number of iterations needed to find all the CPUs that share the same
coregroup.

Use a temporary mask to iterate through the CPUs that may share the
coregroup mask. Also, instead of setting one CPU at a time into
cpu_coregroup_mask, copy the SMT4/SMT8 submask in one shot.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2->v3:
Use GFP_ATOMIC instead of GFP_KERNEL since allocations need to be
atomic at the time of CPU HotPlug
Reported by Qian Cai 

 arch/powerpc/kernel/smp.c | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b48ae4e306d3..bbaea93dc558 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1340,19 +1340,34 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
 
 static void update_coregroup_mask(int cpu)
 {
-   int first_thread = cpu_first_thread_sibling(cpu);
+   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
+   cpumask_var_t mask;
int coregroup_id = cpu_to_coregroup_id(cpu);
int i;
 
-   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
-   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
-   int fcpu = cpu_first_thread_sibling(i);
+   /* In CPU-hotplug path, hence use GFP_ATOMIC */
+   alloc_cpumask_var_node(&mask, GFP_ATOMIC, cpu_to_node(cpu));
+   cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
+
+   if (shared_caches)
+   submask_fn = cpu_l2_cache_mask;
 
-   if (fcpu == first_thread)
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
-   else if (coregroup_id == cpu_to_coregroup_id(i))
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   /* Update coregroup mask with all the CPUs that are part of submask */
+   or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
+
+   /* Skip all CPUs already part of coregroup mask */
+   cpumask_andnot(mask, mask, cpu_coregroup_mask(cpu));
+
+   for_each_cpu(i, mask) {
+   /* Skip all CPUs not part of this coregroup */
+   if (coregroup_id == cpu_to_coregroup_id(i)) {
+   or_cpumasks_related(cpu, i, submask_fn, 
cpu_coregroup_mask);
+   cpumask_andnot(mask, mask, submask_fn(i));
+   } else {
+   cpumask_andnot(mask, mask, cpu_coregroup_mask(i));
+   }
}
+   free_cpumask_var(mask);
 }
 
 static void add_cpu_to_masks(int cpu)
-- 
2.17.1



[PATCH v3 09/11] powerpc/smp: Optimize update_mask_by_l2

2020-10-07 Thread Srikar Dronamraju
All threads of an SMT4 core are either part of this CPU's l2-cache
mask or not related to this CPU's l2-cache mask at all. Use this
relation to reduce the number of iterations needed to find all the
CPUs that share the same l2-cache.

Use a temporary mask to iterate through the CPUs that may share the
l2-cache mask. Also, instead of setting one CPU at a time into
cpu_l2_cache_mask, copy the SMT4 submask in one shot.
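
As a concrete illustration of the one-shot copy (not part of the patch;
the CPU numbers below are made up, assuming CPUs 8-11 and 12-15 are two
SMT4 cores that turn out to share an L2):

	/*
	 * A single call ORs each core's sibling set into every
	 * destination mask of the other core, instead of one
	 * cpumask_set_cpu() per CPU pair.
	 */
	or_cpumasks_related(8, 12, cpu_sibling_mask, cpu_l2_cache_mask);

	/*
	 * Afterwards cpu_l2_cache_mask(k) for k in 8..11 also contains
	 * 12..15, and vice versa. The caller then drops the whole
	 * 12..15 submask from its temporary candidate mask with
	 * cpumask_andnot(), so those CPUs are never visited again.
	 */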

Signed-off-by: Srikar Dronamraju 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
Changelog v2->v3:
Use GFP_ATOMIC instead of GFP_KERNEL since allocations need to be
atomic at the time of CPU HotPlug
Reported by Qian Cai 

 arch/powerpc/kernel/smp.c | 52 +--
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6f866e6b12f8..17e90c2414af 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -671,6 +671,28 @@ static void set_cpus_unrelated(int i, int j,
 #endif
 
 /*
+ * Extends set_cpus_related. Instead of setting one CPU at a time in
+ * dstmask, set srcmask at oneshot. dstmask should be super set of srcmask.
+ */
+static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
+   struct cpumask *(*dstmask)(int))
+{
+   struct cpumask *mask;
+   int k;
+
+   mask = srcmask(j);
+   for_each_cpu(k, srcmask(i))
+   cpumask_or(dstmask(k), dstmask(k), mask);
+
+   if (i == j)
+   return;
+
+   mask = srcmask(i);
+   for_each_cpu(k, srcmask(j))
+   cpumask_or(dstmask(k), dstmask(k), mask);
+}
+
+/*
  * parse_thread_groups: Parses the "ibm,thread-groups" device tree
  *  property for the CPU device node @dn and stores
  *  the parsed output in the thread_groups
@@ -1220,7 +1242,9 @@ static struct device_node *cpu_to_l2cache(int cpu)
 
 static bool update_mask_by_l2(int cpu)
 {
+   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
struct device_node *l2_cache, *np;
+   cpumask_var_t mask;
int i;
 
l2_cache = cpu_to_l2cache(cpu);
@@ -1240,22 +1264,38 @@ static bool update_mask_by_l2(int cpu)
return false;
}
 
-   cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu));
-   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+   /* In CPU-hotplug path, hence use GFP_ATOMIC */
+   alloc_cpumask_var_node(&mask, GFP_ATOMIC, cpu_to_node(cpu));
+   cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
+
+   if (has_big_cores)
+   submask_fn = cpu_smallcore_mask;
+
+   /* Update l2-cache mask with all the CPUs that are part of submask */
+   or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
+
+   /* Skip all CPUs already part of current CPU l2-cache mask */
+   cpumask_andnot(mask, mask, cpu_l2_cache_mask(cpu));
+
+   for_each_cpu(i, mask) {
/*
 * when updating the marks the current CPU has not been marked
 * online, but we need to update the cache masks
 */
np = cpu_to_l2cache(i);
-   if (!np)
-   continue;
 
-   if (np == l2_cache)
-   set_cpus_related(cpu, i, cpu_l2_cache_mask);
+   /* Skip all CPUs already part of current CPU l2-cache */
+   if (np == l2_cache) {
+   or_cpumasks_related(cpu, i, submask_fn, 
cpu_l2_cache_mask);
+   cpumask_andnot(mask, mask, submask_fn(i));
+   } else {
+   cpumask_andnot(mask, mask, cpu_l2_cache_mask(i));
+   }
 
of_node_put(np);
}
of_node_put(l2_cache);
+   free_cpumask_var(mask);
 
return true;
 }
-- 
2.17.1



[PATCH v3 10/11] powerpc/smp: Move coregroup mask updation to a new function

2020-10-07 Thread Srikar Dronamraju
Move the logic for updating the coregroup mask of a CPU into its own
function. This will help in reworking how the coregroup mask is updated
in a subsequent patch.

Signed-off-by: Srikar Dronamraju 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 17e90c2414af..b48ae4e306d3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1338,6 +1338,23 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
}
 }
 
+static void update_coregroup_mask(int cpu)
+{
+   int first_thread = cpu_first_thread_sibling(cpu);
+   int coregroup_id = cpu_to_coregroup_id(cpu);
+   int i;
+
+   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+   int fcpu = cpu_first_thread_sibling(i);
+
+   if (fcpu == first_thread)
+   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   else if (coregroup_id == cpu_to_coregroup_id(i))
+   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   }
+}
+
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
@@ -1356,19 +1373,8 @@ static void add_cpu_to_masks(int cpu)
add_cpu_to_smallcore_masks(cpu);
update_mask_by_l2(cpu);
 
-   if (has_coregroup_support()) {
-   int coregroup_id = cpu_to_coregroup_id(cpu);
-
-   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
-   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
-   int fcpu = cpu_first_thread_sibling(i);
-
-   if (fcpu == first_thread)
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
-   else if (coregroup_id == cpu_to_coregroup_id(i))
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
-   }
-   }
+   if (has_coregroup_support())
+   update_coregroup_mask(cpu);
 }
 
 /* Activate a secondary processor. */
-- 
2.17.1



[PATCH v3 08/11] powerpc/smp: Check for duplicate topologies and consolidate

2020-10-07 Thread Srikar Dronamraju
CACHE and COREGROUP domains are now part of the default topology. However
on systems that don't support CACHE or COREGROUP, these domains will
eventually be degenerated. The degeneration happens per CPU. Do note that
the current fixup_topology() logic ensures that the mask of a domain that
is not supported on the current platform is set to that of the previous
domain.

Instead of waiting for the scheduler to degenerate them, try to
consolidate the levels based on their masks and sd_flags. This is done
just before setting the scheduler topology.
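
As an example of what gets consolidated (hypothetical platform, level
names as in powerpc_topology[]): on a system without coregroup support
the existing fixup described above leaves

	SMT   -> smallcore/SMT mask
	CACHE -> shared-cache mask
	MC    -> shared-cache mask   (copied because !has_coregroup_support())
	DIE   -> cpu_cpu_mask

so CACHE and MC carry the same mask function and compatible sd_flags.
The loop below then folds them into a single level before
set_sched_topology() is called, rather than leaving every CPU's hotplug
path to degenerate the duplicate domain.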

Signed-off-by: Srikar Dronamraju 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index aeb219a4bf7a..6f866e6b12f8 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1401,6 +1401,8 @@ int setup_profiling_timer(unsigned int multiplier)
 
 static void fixup_topology(void)
 {
+   int i;
+
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
@@ -1410,6 +1412,30 @@ static void fixup_topology(void)
 
if (!has_coregroup_support())
powerpc_topology[mc_idx].mask = 
powerpc_topology[cache_idx].mask;
+
+   /*
+* Try to consolidate topology levels here instead of
+* allowing scheduler to degenerate.
+* - Dont consolidate if masks are different.
+* - Dont consolidate if sd_flags exists and are different.
+*/
+   for (i = 1; i <= die_idx; i++) {
+   if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask)
+   continue;
+
+   if (powerpc_topology[i].sd_flags && powerpc_topology[i - 
1].sd_flags &&
+   powerpc_topology[i].sd_flags != 
powerpc_topology[i - 1].sd_flags)
+   continue;
+
+   if (!powerpc_topology[i - 1].sd_flags)
+   powerpc_topology[i - 1].sd_flags = 
powerpc_topology[i].sd_flags;
+
+   powerpc_topology[i].mask = powerpc_topology[i + 1].mask;
+   powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags;
+#ifdef CONFIG_SCHED_DEBUG
+   powerpc_topology[i].name = powerpc_topology[i + 1].name;
+#endif
+   }
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
-- 
2.17.1



[PATCH v3 07/11] powerpc/smp: Depend on cpu_l1_cache_map when adding CPUs

2020-10-07 Thread Srikar Dronamraju
Currently on hotplug/hotunplug, the CPU iterates through all the CPUs in
its core to find the threads in its thread group. However this info is
already captured in cpu_l1_cache_map. Hence reduce the iterations and
clean up the add_cpu_to_smallcore_masks function.

Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 441c9c64b1e3..aeb219a4bf7a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1285,16 +1285,15 @@ static void remove_cpu_from_masks(int cpu)
 
 static inline void add_cpu_to_smallcore_masks(int cpu)
 {
-   struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu);
-   int i, first_thread = cpu_first_thread_sibling(cpu);
+   int i;
 
if (!has_big_cores)
return;
 
cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu));
 
-   for (i = first_thread; i < first_thread + threads_per_core; i++) {
-   if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map))
+   for_each_cpu(i, per_cpu(cpu_l1_cache_map, cpu)) {
+   if (cpu_online(i))
set_cpus_related(i, cpu, cpu_smallcore_mask);
}
 }
-- 
2.17.1



[PATCH v3 05/11] powerpc/smp: Limit CPUs traversed to within a node.

2020-10-07 Thread Srikar Dronamraju
All the arch-specific topology cpumasks are within a node/DIE.
However when setting these per-CPU cpumasks, the system traverses
through all the online CPUs. This is redundant.

Reduce the traversal to only the online CPUs in the node to which
the CPU belongs.

Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2e61a81aad88..c860c4950c9f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1241,7 +1241,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
*(*mask_fn)(int))
}
 
cpumask_set_cpu(cpu, mask_fn(cpu));
-   for_each_cpu(i, cpu_online_mask) {
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
/*
 * when updating the marks the current CPU has not been marked
 * online, but we need to update the cache masks
-- 
2.17.1



[PATCH v3 03/11] powerpc/smp: Remove get_physical_package_id

2020-10-07 Thread Srikar Dronamraju
Now that cpu_core_mask has been removed and topology_core_cpumask has
been updated to use cpu_cpu_mask, we no longer need
get_physical_package_id.

Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/include/asm/topology.h |  5 -
 arch/powerpc/kernel/smp.c   | 20 
 2 files changed, 25 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index e0f232533c9d..e45219f74be0 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -114,12 +114,7 @@ static inline int cpu_to_coregroup_id(int cpu)
 #ifdef CONFIG_PPC64
 #include 
 
-#ifdef CONFIG_PPC_SPLPAR
-int get_physical_package_id(int cpu);
-#define topology_physical_package_id(cpu)  (get_physical_package_id(cpu))
-#else
 #define topology_physical_package_id(cpu)  (cpu_to_chip_id(cpu))
-#endif
 
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu))
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ec41491beca4..8c095fe237b2 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1292,26 +1292,6 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
}
 }
 
-int get_physical_package_id(int cpu)
-{
-   int pkg_id = cpu_to_chip_id(cpu);
-
-   /*
-* If the platform is PowerNV or Guest on KVM, ibm,chip-id is
-* defined. Hence we would return the chip-id as the result of
-* get_physical_package_id.
-*/
-   if (pkg_id == -1 && firmware_has_feature(FW_FEATURE_LPAR) &&
-   IS_ENABLED(CONFIG_PPC_SPLPAR)) {
-   struct device_node *np = of_get_cpu_node(cpu, NULL);
-   pkg_id = of_node_to_nid(np);
-   of_node_put(np);
-   }
-
-   return pkg_id;
-}
-EXPORT_SYMBOL_GPL(get_physical_package_id);
-
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
-- 
2.17.1



[PATCH v3 04/11] powerpc/smp: Optimize remove_cpu_from_masks

2020-10-07 Thread Srikar Dronamraju
While offlining a CPU, the system currently iterates through all the CPUs
in the DIE to clear the sibling, l2_cache and smallcore maps. However if
there are many cores in a DIE, the system can end up spending more time
iterating through CPUs which are completely unrelated.

Optimize this by only iterating through a smaller but relevant cpumask.
If shared_caches is set, cpu_l2_cache_map is the relevant one, otherwise
cpu_sibling_map is.

Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/kernel/smp.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c095fe237b2..2e61a81aad88 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1263,14 +1263,21 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
*(*mask_fn)(int))
 #ifdef CONFIG_HOTPLUG_CPU
 static void remove_cpu_from_masks(int cpu)
 {
+   struct cpumask *(*mask_fn)(int) = cpu_sibling_mask;
int i;
 
-   for_each_cpu(i, cpu_cpu_mask(cpu)) {
+   if (shared_caches)
+   mask_fn = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask_fn(cpu)) {
set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
-   if (has_coregroup_support())
+   }
+
+   if (has_coregroup_support()) {
+   for_each_cpu(i, cpu_coregroup_mask(cpu))
set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
}
 }
-- 
2.17.1



[PATCH v3 00/11] Optimization to improve CPU online/offline on Powerpc

2020-10-07 Thread Srikar Dronamraju
99.5000th: 113          99.5000th: 104
99.9000th: 159          99.9000th: 129
min=0, max=15221        min=0, max=7666

100 iterations of ppc64_cpu --smt=1 / ppc64_cpu --smt=8
Units: seconds : lower is better
-
ppc64_cpu --smt=1
kernel        N    Min    Max    Median   Avg      Stddev
powerpc/next  100  13.39  17.55  14.71    14.7658  0.69184745
+patchset     100  13.3   16.27  14.33    14.4179  0.5427433

ppc64_cpu --smt=8
kernel        N    Min    Max    Median   Avg      Stddev
powerpc/next  100  21.65  26.17  23.71    23.7111  0.8589786
+patchset     100  21.88  25.79  23.16    23.2945  0.86394839


Observations:
Performance of ebizzy / perf_sched_bench / schbench remains the
same with and without the patchset.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 

Srikar Dronamraju (11):
  powerpc/topology: Update topology_core_cpumask
  powerpc/smp: Stop updating cpu_core_mask
  powerpc/smp: Remove get_physical_package_id
  powerpc/smp: Optimize remove_cpu_from_masks
  powerpc/smp: Limit CPUs traversed to within a node.
  powerpc/smp: Stop passing mask to update_mask_by_l2
  powerpc/smp: Depend on cpu_l1_cache_map when adding CPUs
  powerpc/smp: Check for duplicate topologies and consolidate
  powerpc/smp: Optimize update_mask_by_l2
  powerpc/smp: Move coregroup mask updation to a new function
  powerpc/smp: Optimize update_coregroup_mask

 arch/powerpc/include/asm/smp.h  |   5 -
 arch/powerpc/include/asm/topology.h |   7 +-
 arch/powerpc/kernel/smp.c   | 188 +++-
 3 files changed, 122 insertions(+), 78 deletions(-)

-- 
2.17.1



[PATCH v3 02/11] powerpc/smp: Stop updating cpu_core_mask

2020-10-07 Thread Srikar Dronamraju
Anton Blanchard reported that his 4096 vCPU KVM guest took around 30
minutes to boot. He also traced the time to the iterations performed
while setting up the cpu_core_mask.

Further analysis shows that cpu_core_mask and cpu_cpu_mask for any CPU
would be equal on Power. However cpu_core_mask took forever to update
as it is a per-CPU cpumask variable, whereas cpu_cpu_mask is a per-NODE
/ per-DIE cpumask shared by all the respective CPUs.

Also, cpu_cpu_mask is needed from a scheduler perspective. However
cpu_core_map is an exported symbol. Hence stop updating cpu_core_map
and make it point to cpu_cpu_mask.

Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/include/asm/smp.h |  5 -
 arch/powerpc/kernel/smp.c  | 33 +++--
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 041f0b97c45b..40e121dd16af 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -119,11 +119,6 @@ static inline struct cpumask *cpu_sibling_mask(int cpu)
return per_cpu(cpu_sibling_map, cpu);
 }
 
-static inline struct cpumask *cpu_core_mask(int cpu)
-{
-   return per_cpu(cpu_core_map, cpu);
-}
-
 static inline struct cpumask *cpu_l2_cache_mask(int cpu)
 {
return per_cpu(cpu_l2_cache_map, cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3d96752d6570..ec41491beca4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -953,12 +953,17 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
local_memory_node(numa_cpu_lookup_table[cpu]));
}
 #endif
+   /*
+* cpu_core_map is no more updated and exists only since
+* it's been exported for long. It only will have a snapshot
+* of cpu_cpu_mask.
+*/
+   cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
}
 
/* Init the cpumasks so the boot CPU is related to itself */
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
-   cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
if (has_coregroup_support())
cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
@@ -1260,9 +1265,7 @@ static void remove_cpu_from_masks(int cpu)
 {
int i;
 
-   /* NB: cpu_core_mask is a superset of the others */
-   for_each_cpu(i, cpu_core_mask(cpu)) {
-   set_cpus_unrelated(cpu, i, cpu_core_mask);
+   for_each_cpu(i, cpu_cpu_mask(cpu)) {
set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
@@ -1312,7 +1315,6 @@ EXPORT_SYMBOL_GPL(get_physical_package_id);
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
-   int pkg_id = get_physical_package_id(cpu);
int i;
 
/*
@@ -1320,7 +1322,6 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
@@ -1342,26 +1343,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_coregroup_mask);
}
}
-
-   if (pkg_id == -1) {
-   struct cpumask *(*mask)(int) = cpu_sibling_mask;
-
-   /*
-* Copy the sibling mask into core sibling mask and
-* mark any CPUs on the same chip as this CPU.
-*/
-   if (shared_caches)
-   mask = cpu_l2_cache_mask;
-
-   for_each_cpu(i, mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
-
-   return;
-   }
-
-   for_each_cpu(i, cpu_online_mask)
-   if (get_physical_package_id(i) == pkg_id)
-   set_cpus_related(cpu, i, cpu_core_mask);
 }
 
 /* Activate a secondary processor. */
-- 
2.17.1



[PATCH v3 01/11] powerpc/topology: Update topology_core_cpumask

2020-10-07 Thread Srikar Dronamraju
On Power, cpu_core_mask and cpu_cpu_mask refer to the same set of CPUs.
cpu_cpu_mask is needed by the scheduler, hence look at deprecating
cpu_core_mask. Before deleting cpu_core_mask, ensure its only user
is moved to cpu_cpu_mask.

Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Qian Cai 
---
 arch/powerpc/include/asm/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index 6609174918ab..e0f232533c9d 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -122,7 +122,7 @@ int get_physical_package_id(int cpu);
 #endif
 
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
-#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
+#define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu))
 #define topology_core_id(cpu)  (cpu_to_core_id(cpu))
 
 #endif
-- 
2.17.1



Re: [PATCH v2 09/11] powerpc/smp: Optimize update_mask_by_l2

2020-10-07 Thread Srikar Dronamraju
* Qian Cai  [2020-10-07 09:05:42]:

Hi Qian,

Thanks for testing and reporting the failure.

> On Mon, 2020-09-21 at 15:26 +0530, Srikar Dronamraju wrote:
> > All threads of a SMT4 core can either be part of this CPU's l2-cache
> > mask or not related to this CPU l2-cache mask. Use this relation to
> > reduce the number of iterations needed to find all the CPUs that share
> > the same l2-cache.
> > 
> > Use a temporary mask to iterate through the CPUs that may share l2_cache
> > mask. Also instead of setting one CPU at a time into cpu_l2_cache_mask,
> > copy the SMT4/sub mask at one shot.
> > 
> ...
> >  static bool update_mask_by_l2(int cpu)
> >  {
> > +   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
> > struct device_node *l2_cache, *np;
> > +   cpumask_var_t mask;
> > int i;
> >  
> > l2_cache = cpu_to_l2cache(cpu);
> > @@ -1240,22 +1264,37 @@ static bool update_mask_by_l2(int cpu)
> > return false;
> > }
> >  
> > -   cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu));
> > -   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
> > +   alloc_cpumask_var_node(, GFP_KERNEL, cpu_to_node(cpu));
> 
> Shouldn't this be GFP_ATOMIC? Otherwise, during the CPU hotplugging, we have,

Can you confirm if CONFIG_CPUMASK_OFFSTACK is enabled in your config?
Because if !CONFIG_CPUMASK_OFFSTACK, then alloc_cpumask_var_node would do
nothing but return true.

Regarding CONFIG_CPUMASK_OFFSTACK, not sure how much powerpc was tested
with that config enabled.

Please refer to
http://lore.kernel.org/lkml/87o8nv51bg@mpe.ellerman.id.au/t/#u
And we do have an issue to track the same:
https://github.com/linuxppc/issues/issues/321 for enabling / testing /
verifying that CONFIG_CPUMASK_OFFSTACK works. I also don't see any
powerpc Kconfig enabling this.

I do agree with your suggestion that we could substitute GFP_KERNEL
with GFP_ATOMIC.

> 
> (irqs were disabled in do_idle())
> 
> [  335.420001][T0] BUG: sleeping function called from invalid context at 
> mm/slab.h:494
> [  335.420003][T0] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 
> 0, name: swapper/88
> [  335.420005][T0] no locks held by swapper/88/0.
> [  335.420007][T0] irq event stamp: 18074448
> [  335.420015][T0] hardirqs last  enabled at (18074447): 
> [] tick_nohz_idle_enter+0x9c/0x110
> [  335.420019][T0] hardirqs last disabled at (18074448): 
> [] do_idle+0x138/0x3b0
> do_idle at kernel/sched/idle.c:253 (discriminator 1)
> [  335.420023][T0] softirqs last  enabled at (18074440): 
> [] irq_enter_rcu+0x94/0xa0
> [  335.420026][T0] softirqs last disabled at (18074439): 
> [] irq_enter_rcu+0x70/0xa0
> [  335.420030][T0] CPU: 88 PID: 0 Comm: swapper/88 Tainted: GW
>  5.9.0-rc8-next-20201007 #1
> [  335.420032][T0] Call Trace:
> [  335.420037][T0] [c0002a4bfcf0] [c0649e98] 
> dump_stack+0xec/0x144 (unreliable)
> [  335.420043][T0] [c0002a4bfd30] [c00f6c34] 
> ___might_sleep+0x2f4/0x310
> [  335.420048][T0] [c0002a4bfdb0] [c0354f94] 
> slab_pre_alloc_hook.constprop.82+0x124/0x190
> [  335.420051][T0] [c0002a4bfe00] [c035e9e8] 
> __kmalloc_node+0x88/0x3a0
> slab_alloc_node at mm/slub.c:2817
> (inlined by) __kmalloc_node at mm/slub.c:4013
> [  335.420054][T0] [c0002a4bfe80] [c06494d8] 
> alloc_cpumask_var_node+0x38/0x80
> kmalloc_node at include/linux/slab.h:577
> (inlined by) alloc_cpumask_var_node at lib/cpumask.c:116
> [  335.420060][T0] [c0002a4bfef0] [c003eedc] 
> start_secondary+0x27c/0x800
> update_mask_by_l2 at arch/powerpc/kernel/smp.c:1267
> (inlined by) add_cpu_to_masks at arch/powerpc/kernel/smp.c:1387
> (inlined by) start_secondary at arch/powerpc/kernel/smp.c:1420
> [  335.420063][T0] [c0002a4bff90] [c000c468] 
> start_secondary_resume+0x10/0x14
> 
> > +   cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
> > +
> > +   if (has_big_cores)
> > +   submask_fn = cpu_smallcore_mask;
> > +
> > +   /* Update l2-cache mask with all the CPUs that are part of submask */
> > +   or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
> > +
> > +   /* Skip all CPUs already part of current CPU l2-cache mask */
> > +   cpumask_andnot(mask, mask, cpu_l2_cache_mask(cpu));
> > +
> > +   for_each_cpu(i, mask) {
> > /*
> >  * when updating the marks the current CPU has not been marked
> >  * online, but we need to update the cache masks
> >  */
> > np = cpu_to_l2cache(i);
> > -   if (!np)
> > -   

Re: [PATCH] cpufreq: powernv: Fix frame-size-overflow in powernv_cpufreq_reboot_notifier

2020-09-23 Thread Srikar Dronamraju
* Daniel Axtens  [2020-09-24 12:35:05]:

> Hi Srikar,
> 
> 
> This looks a lot like commit d95fe371ecd2 ("cpufreq: powernv: Fix 
> frame-size-overflow in powernv_cpufreq_work_fn").
> 

Yes, very very similar.

> As with that patch, I have checked for matching puts/gets and that all
> uses of '&' check out.
> 
> I tried to look at the snowpatch tests: they apparently reported a
> checkpatch warning but the file has since disappeared so I can't see
> what it was. Running checkpatch locally:
> 
> $ scripts/checkpatch.pl -g HEAD -strict
> WARNING: Possible unwrapped commit description (prefer a maximum 75 chars per 
> line)
> #15: 
> make[3]: *** [./scripts/Makefile.build:316: 
> drivers/cpufreq/powernv-cpufreq.o] Error 1
> 
> This is benign and you shouldn't wrap that line anyway.
> 
> On that basis:
> 
> Reviewed-by: Daniel Axtens 
> 

Thanks Daniel.

> Kind regards,
> Daniel
> 
-- 
Thanks and Regards
Srikar Dronamraju


[PATCH] cpufreq: powernv: Fix frame-size-overflow in powernv_cpufreq_reboot_notifier

2020-09-22 Thread Srikar Dronamraju
The patch avoids allocating cpufreq_policy on the stack, hence fixing the
frame-size overflow in 'powernv_cpufreq_reboot_notifier':

./drivers/cpufreq/powernv-cpufreq.c: In function
'powernv_cpufreq_reboot_notifier':
./drivers/cpufreq/powernv-cpufreq.c:906:1: error: the frame size of 2064 bytes 
is larger than 2048 bytes [-Werror=frame-larger-than=]
 }
 ^
cc1: all warnings being treated as errors
make[3]: *** [./scripts/Makefile.build:316: drivers/cpufreq/powernv-cpufreq.o] 
Error 1
make[2]: *** [./scripts/Makefile.build:556: drivers/cpufreq] Error 2
make[1]: *** [./Makefile:1072: drivers] Error 2
make[1]: *** Waiting for unfinished jobs
make: *** [Makefile:157: sub-make] Error 2

Fixes: cf30af76 ("cpufreq: powernv: Set the cpus to nominal frequency during 
reboot/kexec")
Cc: Pratik Rajesh Sampat 
Cc: Daniel Axtens 
Cc: Michael Ellerman 
Cc: linuxppc-dev 
Signed-off-by: Srikar Dronamraju 
---
 drivers/cpufreq/powernv-cpufreq.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index a9af15e..e439b43 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -885,12 +885,15 @@ static int powernv_cpufreq_reboot_notifier(struct 
notifier_block *nb,
unsigned long action, void *unused)
 {
int cpu;
-   struct cpufreq_policy cpu_policy;
+   struct cpufreq_policy *cpu_policy;
 
rebooting = true;
for_each_online_cpu(cpu) {
-   cpufreq_get_policy(&cpu_policy, cpu);
-   powernv_cpufreq_target_index(&cpu_policy, get_nominal_index());
+   cpu_policy = cpufreq_cpu_get(cpu);
+   if (!cpu_policy)
+   continue;
+   powernv_cpufreq_target_index(cpu_policy, get_nominal_index());
+   cpufreq_cpu_put(cpu_policy);
}
 
return NOTIFY_DONE;
-- 
1.8.3.1



Re: [PATCH] powerpc/64: Make VDSO32 track COMPAT on 64-bit

2020-09-22 Thread Srikar Dronamraju
* Michael Ellerman  [2020-09-17 21:28:46]:

> On Tue, 8 Sep 2020 22:58:50 +1000, Michael Ellerman wrote:
> > When we added the VDSO32 kconfig symbol, which controls building of
> > the 32-bit VDSO, we made it depend on CPU_BIG_ENDIAN (for 64-bit).
> > 
> > That was because back then COMPAT was always enabled for 64-bit, so
> > depending on it would have left the 32-bit VDSO always enabled, which
> > we didn't want.
> > 
> > [...]
> 
> Applied to powerpc/next.
> 
> [1/1] powerpc/64: Make VDSO32 track COMPAT on 64-bit
>   
> https://git.kernel.org/powerpc/c/231b232df8f67e7d37af01259c21f2a131c3911e
> 
> cheers

With this commit, which is part of powerpc/next, and with
/opt/at12.0/bin/gcc --version
gcc (GCC) 8.4.1 20191125 (Advance-Toolchain 12.0-3) [e25f27eea473]
the build throws up a compile error on a witherspoon/PowerNV with
CONFIG_COMPAT. CONFIG_COMPAT got carried over from the distro config.
(And it looks like most distros have this config enabled.)

cc1: error: '-m32' not supported in this configuration
make[4]: *** [arch/powerpc/kernel/vdso32/sigtramp.o] Error 1
make[4]: *** Waiting for unfinished jobs
cc1: error: '-m32' not supported in this configuration
make[4]: *** [arch/powerpc/kernel/vdso32/gettimeofday.o] Error 1
make[3]: *** [arch/powerpc/kernel/vdso32] Error 2
make[3]: *** Waiting for unfinished jobs
make[2]: *** [arch/powerpc/kernel] Error 2
make[2]: *** Waiting for unfinished jobs
make[1]: *** [arch/powerpc] Error 2
make[1]: *** Waiting for unfinished jobs
make: *** [__sub-make] Error 2

I don't seem to be facing this with other compilers like "gcc (Ubuntu
7.4.0-1ubuntu1~18.04.1) 7.4.0", and I was able to disable CONFIG_COMPAT
and proceed with the build.

-- 
Thanks and Regards
Srikar Dronamraju


[PATCH v2 11/11] powerpc/smp: Optimize update_coregroup_mask

2020-09-21 Thread Srikar Dronamraju
All threads of an SMT4/SMT8 core are either part of this CPU's coregroup
mask or entirely outside the coregroup. Use this relation to reduce the
number of iterations needed to find all the CPUs that share the same
coregroup.

Use a temporary mask to iterate through the CPUs that may share the
coregroup mask. Also, instead of setting one CPU at a time into
cpu_coregroup_mask, copy the SMT4/SMT8 submask in one shot.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b48ae4e306d3..bbaea93dc558 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1339,19 +1339,33 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
 
 static void update_coregroup_mask(int cpu)
 {
-   int first_thread = cpu_first_thread_sibling(cpu);
+   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
+   cpumask_var_t mask;
int coregroup_id = cpu_to_coregroup_id(cpu);
int i;
 
-   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
-   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
-   int fcpu = cpu_first_thread_sibling(i);
+   alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
+   cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
+
+   if (shared_caches)
+   submask_fn = cpu_l2_cache_mask;
+
+   /* Update coregroup mask with all the CPUs that are part of submask */
+   or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
+
+   /* Skip all CPUs already part of coregroup mask */
+   cpumask_andnot(mask, mask, cpu_coregroup_mask(cpu));
 
-   if (fcpu == first_thread)
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
-   else if (coregroup_id == cpu_to_coregroup_id(i))
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   for_each_cpu(i, mask) {
+   /* Skip all CPUs not part of this coregroup */
+   if (coregroup_id == cpu_to_coregroup_id(i)) {
+   or_cpumasks_related(cpu, i, submask_fn, 
cpu_coregroup_mask);
+   cpumask_andnot(mask, mask, submask_fn(i));
+   } else {
+   cpumask_andnot(mask, mask, cpu_coregroup_mask(i));
+   }
}
+   free_cpumask_var(mask);
 }
 
 static void add_cpu_to_masks(int cpu)
-- 
2.17.1



[PATCH v2 10/11] powerpc/smp: Move coregroup mask updation to a new function

2020-09-21 Thread Srikar Dronamraju
Move the logic for updating the coregroup mask of a CPU into its own
function. This will help in reworking how the coregroup mask is updated
in a subsequent patch.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 17e90c2414af..b48ae4e306d3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1337,6 +1337,23 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
}
 }
 
+static void update_coregroup_mask(int cpu)
+{
+   int first_thread = cpu_first_thread_sibling(cpu);
+   int coregroup_id = cpu_to_coregroup_id(cpu);
+   int i;
+
+   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+   int fcpu = cpu_first_thread_sibling(i);
+
+   if (fcpu == first_thread)
+   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   else if (coregroup_id == cpu_to_coregroup_id(i))
+   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   }
+}
+
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
@@ -1355,19 +1372,8 @@ static void add_cpu_to_masks(int cpu)
add_cpu_to_smallcore_masks(cpu);
update_mask_by_l2(cpu);
 
-   if (has_coregroup_support()) {
-   int coregroup_id = cpu_to_coregroup_id(cpu);
-
-   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
-   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
-   int fcpu = cpu_first_thread_sibling(i);
-
-   if (fcpu == first_thread)
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
-   else if (coregroup_id == cpu_to_coregroup_id(i))
-   set_cpus_related(cpu, i, cpu_coregroup_mask);
-   }
-   }
+   if (has_coregroup_support())
+   update_coregroup_mask(cpu);
 }
 
 /* Activate a secondary processor. */
-- 
2.17.1



[PATCH v2 09/11] powerpc/smp: Optimize update_mask_by_l2

2020-09-21 Thread Srikar Dronamraju
All threads of an SMT4 core are either part of this CPU's l2-cache
mask or not related to this CPU's l2-cache mask at all. Use this
relation to reduce the number of iterations needed to find all the
CPUs that share the same l2-cache.

Use a temporary mask to iterate through the CPUs that may share the
l2-cache mask. Also, instead of setting one CPU at a time into
cpu_l2_cache_mask, copy the SMT4 submask in one shot.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 51 ++-
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6f866e6b12f8..17e90c2414af 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -670,6 +670,28 @@ static void set_cpus_unrelated(int i, int j,
 }
 #endif
 
+/*
+ * Extends set_cpus_related. Instead of setting one CPU at a time in
+ * dstmask, set srcmask at oneshot. dstmask should be super set of srcmask.
+ */
+static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
+   struct cpumask *(*dstmask)(int))
+{
+   struct cpumask *mask;
+   int k;
+
+   mask = srcmask(j);
+   for_each_cpu(k, srcmask(i))
+   cpumask_or(dstmask(k), dstmask(k), mask);
+
+   if (i == j)
+   return;
+
+   mask = srcmask(i);
+   for_each_cpu(k, srcmask(j))
+   cpumask_or(dstmask(k), dstmask(k), mask);
+}
+
 /*
  * parse_thread_groups: Parses the "ibm,thread-groups" device tree
  *  property for the CPU device node @dn and stores
@@ -1220,7 +1242,9 @@ static struct device_node *cpu_to_l2cache(int cpu)
 
 static bool update_mask_by_l2(int cpu)
 {
+   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
struct device_node *l2_cache, *np;
+   cpumask_var_t mask;
int i;
 
l2_cache = cpu_to_l2cache(cpu);
@@ -1240,22 +1264,37 @@ static bool update_mask_by_l2(int cpu)
return false;
}
 
-   cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu));
-   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+   alloc_cpumask_var_node(&mask, GFP_KERNEL, cpu_to_node(cpu));
+   cpumask_and(mask, cpu_online_mask, cpu_cpu_mask(cpu));
+
+   if (has_big_cores)
+   submask_fn = cpu_smallcore_mask;
+
+   /* Update l2-cache mask with all the CPUs that are part of submask */
+   or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
+
+   /* Skip all CPUs already part of current CPU l2-cache mask */
+   cpumask_andnot(mask, mask, cpu_l2_cache_mask(cpu));
+
+   for_each_cpu(i, mask) {
/*
 * when updating the marks the current CPU has not been marked
 * online, but we need to update the cache masks
 */
np = cpu_to_l2cache(i);
-   if (!np)
-   continue;
 
-   if (np == l2_cache)
-   set_cpus_related(cpu, i, cpu_l2_cache_mask);
+   /* Skip all CPUs already part of current CPU l2-cache */
+   if (np == l2_cache) {
+   or_cpumasks_related(cpu, i, submask_fn, 
cpu_l2_cache_mask);
+   cpumask_andnot(mask, mask, submask_fn(i));
+   } else {
+   cpumask_andnot(mask, mask, cpu_l2_cache_mask(i));
+   }
 
of_node_put(np);
}
of_node_put(l2_cache);
+   free_cpumask_var(mask);
 
return true;
 }
-- 
2.17.1



[PATCH v2 08/11] powerpc/smp: Check for duplicate topologies and consolidate

2020-09-21 Thread Srikar Dronamraju
CACHE and COREGROUP domains are now part of the default topology. However
on systems that don't support CACHE or COREGROUP, these domains will
eventually be degenerated. The degeneration happens per CPU. Do note that
the current fixup_topology() logic ensures that the mask of a domain that
is not supported on the current platform is set to that of the previous
domain.

Instead of waiting for the scheduler to degenerate them, try to
consolidate the levels based on their masks and sd_flags. This is done
just before setting the scheduler topology.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index aeb219a4bf7a..6f866e6b12f8 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1401,6 +1401,8 @@ int setup_profiling_timer(unsigned int multiplier)
 
 static void fixup_topology(void)
 {
+   int i;
+
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
@@ -1410,6 +1412,30 @@ static void fixup_topology(void)
 
if (!has_coregroup_support())
powerpc_topology[mc_idx].mask = 
powerpc_topology[cache_idx].mask;
+
+   /*
+* Try to consolidate topology levels here instead of
+* allowing scheduler to degenerate.
+* - Dont consolidate if masks are different.
+* - Dont consolidate if sd_flags exists and are different.
+*/
+   for (i = 1; i <= die_idx; i++) {
+   if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask)
+   continue;
+
+   if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags &&
+       powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags)
+   continue;
+
+   if (!powerpc_topology[i - 1].sd_flags)
+           powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags;
+
+   powerpc_topology[i].mask = powerpc_topology[i + 1].mask;
+   powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags;
+#ifdef CONFIG_SCHED_DEBUG
+   powerpc_topology[i].name = powerpc_topology[i + 1].name;
+#endif
+   }
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
-- 
2.17.1



[PATCH v2 04/11] powerpc/smp: Optimize remove_cpu_from_masks

2020-09-21 Thread Srikar Dronamraju
While offlining a CPU, the system currently iterates through all the CPUs in
the DIE to clear the sibling, l2_cache and smallcore maps. However if there
are many cores in a DIE, the system can end up spending more time iterating
through CPUs which are completely unrelated.

Optimize this by only iterating through the smaller but relevant cpumask:
if shared_caches is set, cpu_l2_cache_map is the relevant one, else
cpu_sibling_map is.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
---
 arch/powerpc/kernel/smp.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c095fe237b2..2e61a81aad88 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1263,14 +1263,21 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
 #ifdef CONFIG_HOTPLUG_CPU
 static void remove_cpu_from_masks(int cpu)
 {
+   struct cpumask *(*mask_fn)(int) = cpu_sibling_mask;
int i;
 
-   for_each_cpu(i, cpu_cpu_mask(cpu)) {
+   if (shared_caches)
+   mask_fn = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask_fn(cpu)) {
set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
-   if (has_coregroup_support())
+   }
+
+   if (has_coregroup_support()) {
+   for_each_cpu(i, cpu_coregroup_mask(cpu))
set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
}
 }
-- 
2.17.1



[PATCH v2 07/11] powerpc/smp: Depend on cpu_l1_cache_map when adding CPUs

2020-09-21 Thread Srikar Dronamraju
Currently on hotplug/hotunplug, a CPU iterates through all the CPUs in
its core to find the threads in its thread group. However this info is
already captured in cpu_l1_cache_map. Hence reduce the iterations and
clean up the add_cpu_to_smallcore_masks() function.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
---
 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 441c9c64b1e3..aeb219a4bf7a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1285,16 +1285,15 @@ static void remove_cpu_from_masks(int cpu)
 
 static inline void add_cpu_to_smallcore_masks(int cpu)
 {
-   struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu);
-   int i, first_thread = cpu_first_thread_sibling(cpu);
+   int i;
 
if (!has_big_cores)
return;
 
cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu));
 
-   for (i = first_thread; i < first_thread + threads_per_core; i++) {
-   if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map))
+   for_each_cpu(i, per_cpu(cpu_l1_cache_map, cpu)) {
+   if (cpu_online(i))
set_cpus_related(i, cpu, cpu_smallcore_mask);
}
 }
-- 
2.17.1



[PATCH v2 06/11] powerpc/smp: Stop passing mask to update_mask_by_l2

2020-09-21 Thread Srikar Dronamraju
update_mask_by_l2() is called only once, and it is always passed
cpu_l2_cache_mask as its parameter. Instead of passing cpu_l2_cache_mask,
use it directly in update_mask_by_l2().

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
---
 arch/powerpc/kernel/smp.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c860c4950c9f..441c9c64b1e3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1218,7 +1218,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
 }
 
-static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
+static bool update_mask_by_l2(int cpu)
 {
struct device_node *l2_cache, *np;
int i;
@@ -1240,7 +1240,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
return false;
}
 
-   cpumask_set_cpu(cpu, mask_fn(cpu));
+   cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu));
for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1251,7 +1251,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
continue;
 
if (np == l2_cache)
-   set_cpus_related(cpu, i, mask_fn);
+   set_cpus_related(cpu, i, cpu_l2_cache_mask);
 
of_node_put(np);
}
@@ -1315,7 +1315,7 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   update_mask_by_l2(cpu, cpu_l2_cache_mask);
+   update_mask_by_l2(cpu);
 
if (has_coregroup_support()) {
int coregroup_id = cpu_to_coregroup_id(cpu);
-- 
2.17.1



[PATCH v2 05/11] powerpc/smp: Limit CPUs traversed to within a node.

2020-09-21 Thread Srikar Dronamraju
All the arch-specific topology cpumasks are within a node/DIE.
However when setting these per-CPU cpumasks, the system traverses through
all the online CPUs. This is redundant.

Reduce the traversal to only the CPUs that are online in the node to which
the CPU belongs.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2e61a81aad88..c860c4950c9f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1241,7 +1241,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
}
 
cpumask_set_cpu(cpu, mask_fn(cpu));
-   for_each_cpu(i, cpu_online_mask) {
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
/*
 * when updating the marks the current CPU has not been marked
 * online, but we need to update the cache masks
-- 
2.17.1



[PATCH v2 03/11] powerpc/smp: Remove get_physical_package_id

2020-09-21 Thread Srikar Dronamraju
Now that cpu_core_mask has been removed and topology_core_cpumask has
been updated to use cpu_cpu_mask, we no longer need
get_physical_package_id().

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
---
 arch/powerpc/include/asm/topology.h |  5 -
 arch/powerpc/kernel/smp.c   | 20 
 2 files changed, 25 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index e0f232533c9d..e45219f74be0 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -114,12 +114,7 @@ static inline int cpu_to_coregroup_id(int cpu)
 #ifdef CONFIG_PPC64
 #include 
 
-#ifdef CONFIG_PPC_SPLPAR
-int get_physical_package_id(int cpu);
-#define topology_physical_package_id(cpu)  (get_physical_package_id(cpu))
-#else
 #define topology_physical_package_id(cpu)  (cpu_to_chip_id(cpu))
-#endif
 
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu))
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ec41491beca4..8c095fe237b2 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1292,26 +1292,6 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
}
 }
 
-int get_physical_package_id(int cpu)
-{
-   int pkg_id = cpu_to_chip_id(cpu);
-
-   /*
-* If the platform is PowerNV or Guest on KVM, ibm,chip-id is
-* defined. Hence we would return the chip-id as the result of
-* get_physical_package_id.
-*/
-   if (pkg_id == -1 && firmware_has_feature(FW_FEATURE_LPAR) &&
-   IS_ENABLED(CONFIG_PPC_SPLPAR)) {
-   struct device_node *np = of_get_cpu_node(cpu, NULL);
-   pkg_id = of_node_to_nid(np);
-   of_node_put(np);
-   }
-
-   return pkg_id;
-}
-EXPORT_SYMBOL_GPL(get_physical_package_id);
-
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
-- 
2.17.1



[PATCH v2 02/11] powerpc/smp: Stop updating cpu_core_mask

2020-09-21 Thread Srikar Dronamraju
Anton Blanchard reported that his 4096 vcpu KVM guest took around 30
minutes to boot. He traced it to the time taken to iterate while
setting the cpu_core_mask.

Further analysis shows that cpu_core_mask and cpu_cpu_mask for any CPU
would be equal on Power. However updating cpu_core_mask took forever,
since it is a per-CPU cpumask variable, whereas cpu_cpu_mask is a per
NODE/per DIE cpumask that is shared by all the respective CPUs.

Also, cpu_cpu_mask is needed from a scheduler perspective. However
cpu_core_map is an exported symbol. Hence stop updating cpu_core_map
and make it point to cpu_cpu_mask.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
---
 arch/powerpc/include/asm/smp.h |  5 -
 arch/powerpc/kernel/smp.c  | 33 +++--
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 041f0b97c45b..40e121dd16af 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -119,11 +119,6 @@ static inline struct cpumask *cpu_sibling_mask(int cpu)
return per_cpu(cpu_sibling_map, cpu);
 }
 
-static inline struct cpumask *cpu_core_mask(int cpu)
-{
-   return per_cpu(cpu_core_map, cpu);
-}
-
 static inline struct cpumask *cpu_l2_cache_mask(int cpu)
 {
return per_cpu(cpu_l2_cache_map, cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3d96752d6570..ec41491beca4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -953,12 +953,17 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
local_memory_node(numa_cpu_lookup_table[cpu]));
}
 #endif
+   /*
+* cpu_core_map is no more updated and exists only since
+* it's been exported for long. It will only have a snapshot
+* of cpu_cpu_mask.
+*/
+   cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
}
 
/* Init the cpumasks so the boot CPU is related to itself */
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
-   cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
if (has_coregroup_support())
cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
@@ -1260,9 +1265,7 @@ static void remove_cpu_from_masks(int cpu)
 {
int i;
 
-   /* NB: cpu_core_mask is a superset of the others */
-   for_each_cpu(i, cpu_core_mask(cpu)) {
-   set_cpus_unrelated(cpu, i, cpu_core_mask);
+   for_each_cpu(i, cpu_cpu_mask(cpu)) {
set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
@@ -1312,7 +1315,6 @@ EXPORT_SYMBOL_GPL(get_physical_package_id);
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
-   int pkg_id = get_physical_package_id(cpu);
int i;
 
/*
@@ -1320,7 +1322,6 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
@@ -1342,26 +1343,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_coregroup_mask);
}
}
-
-   if (pkg_id == -1) {
-   struct cpumask *(*mask)(int) = cpu_sibling_mask;
-
-   /*
-* Copy the sibling mask into core sibling mask and
-* mark any CPUs on the same chip as this CPU.
-*/
-   if (shared_caches)
-   mask = cpu_l2_cache_mask;
-
-   for_each_cpu(i, mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
-
-   return;
-   }
-
-   for_each_cpu(i, cpu_online_mask)
-   if (get_physical_package_id(i) == pkg_id)
-   set_cpus_related(cpu, i, cpu_core_mask);
 }
 
 /* Activate a secondary processor. */
-- 
2.17.1



[PATCH v2 01/11] powerpc/topology: Update topology_core_cpumask

2020-09-21 Thread Srikar Dronamraju
On Power, cpu_core_mask and cpu_cpu_mask refer to the same set of CPUs.
cpu_cpu_mask is needed by the scheduler, hence look at deprecating
cpu_core_mask. Before deleting cpu_core_mask, ensure its only user
is moved to cpu_cpu_mask.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
Tested-by: Satheesh Rajendran 
---
 arch/powerpc/include/asm/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 6609174918ab..e0f232533c9d 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -122,7 +122,7 @@ int get_physical_package_id(int cpu);
 #endif
 
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
-#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
+#define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu))
 #define topology_core_id(cpu)  (cpu_to_core_id(cpu))
 
 #endif
-- 
2.17.1



[PATCH v2 00/11] Optimization to improve CPU online/offline on Powerpc

2020-09-21 Thread Srikar Dronamraju
  0.8589786
+patchset 100  21.88  25.79  23.16   23.2945  0.86394839


Observations:
Performance of ebizzy/ perf_sched_bench / schbench remain the
same with and without the patchset.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 

Changelog v1->v2:
v1 link: 
https://lore.kernel.org/linuxppc-dev/20200727075532.30058-1-sri...@linux.vnet.ibm.com/t/#u
Added five more patches on top of Seven.
Rebased to 19th Sept 2020 powerpc/next (based on v5.9-rc2)

Srikar Dronamraju (11):
  powerpc/topology: Update topology_core_cpumask
  powerpc/smp: Stop updating cpu_core_mask
  powerpc/smp: Remove get_physical_package_id
  powerpc/smp: Optimize remove_cpu_from_masks
  powerpc/smp: Limit CPUs traversed to within a node.
  powerpc/smp: Stop passing mask to update_mask_by_l2
  powerpc/smp: Depend on cpu_l1_cache_map when adding CPUs
  powerpc/smp: Check for duplicate topologies and consolidate
  powerpc/smp: Optimize update_mask_by_l2
  powerpc/smp: Move coregroup mask updation to a new function
  powerpc/smp: Optimize update_coregroup_mask

 arch/powerpc/include/asm/smp.h  |   5 -
 arch/powerpc/include/asm/topology.h |   7 +-
 arch/powerpc/kernel/smp.c   | 186 ++--
 3 files changed, 120 insertions(+), 78 deletions(-)

-- 
2.17.1



Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-09-13 Thread Srikar Dronamraju
Fix to make it work where CPUs don't have an l2-cache element.

-------->8--------8<--------

>From b25d47b01b7195b1df19083a4043fa6a87a901a3 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju 
Date: Thu, 9 Jul 2020 13:33:38 +0530
Subject: [PATCH v5.2 05/10] powerpc/smp: Dont assume l2-cache to be superset of
 sibling

Current code assumes that the cpumask of cpus sharing an l2-cache will
always be a superset of cpu_sibling_mask.

Let's stop that assumption. cpu_l2_cache_mask is a superset of
cpu_sibling_mask if and only if shared_caches is set.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Set cpumask after verifying l2-cache. (Gautham)

Changelog v5 -> v5.2:
If cpu has no l2-cache set cpumask as per its
 sibling mask. (Michael Ellerman)

 arch/powerpc/kernel/smp.c | 43 +--
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9f4333d..168532e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1186,9 +1186,23 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
int i;
 
l2_cache = cpu_to_l2cache(cpu);
-   if (!l2_cache)
+   if (!l2_cache) {
+   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+
+   /*
+* If no l2cache for this CPU, assume all siblings to share
+* cache with this CPU.
+*/
+   if (has_big_cores)
+   sibling_mask = cpu_smallcore_mask;
+
+   for_each_cpu(i, sibling_mask(cpu))
+   set_cpus_related(cpu, i, cpu_l2_cache_mask);
+
return false;
+   }
 
+   cpumask_set_cpu(cpu, mask_fn(cpu));
for_each_cpu(i, cpu_online_mask) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1271,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   /*
-* Copy the thread sibling mask into the cache sibling mask
-* and mark any CPUs that share an L2 with this CPU.
-*/
-   for_each_cpu(i, cpu_sibling_mask(cpu))
-   set_cpus_related(cpu, i, cpu_l2_cache_mask);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-   /*
-* Copy the cache sibling mask into core sibling mask and mark
-* any CPUs on the same chip as this CPU.
-*/
-   for_each_cpu(i, cpu_l2_cache_mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
+   if (pkg_id == -1) {
+   struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+   /*
+* Copy the sibling mask into core sibling mask and
+* mark any CPUs on the same chip as this CPU.
+*/
+   if (shared_caches)
+   mask = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask(cpu))
+   set_cpus_related(cpu, i, cpu_core_mask);
 
-   if (pkg_id == -1)
return;
+   }
 
for_each_cpu(i, cpu_online_mask)
if (get_physical_package_id(i) == pkg_id)
-- 
2.17.1



Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-09-13 Thread Srikar Dronamraju
* Michael Ellerman  [2020-09-13 11:46:41]:

> Srikar Dronamraju  writes:
> > * Michael Ellerman  [2020-09-11 21:55:23]:
> >
> >> Srikar Dronamraju  writes:
> >> > Current code assumes that cpumask of cpus sharing a l2-cache mask will
> >> > always be a superset of cpu_sibling_mask.
> >> >
> >> > Lets stop that assumption. cpu_l2_cache_mask is a superset of
> >> > cpu_sibling_mask if and only if shared_caches is set.
> >> 
> >> I'm seeing oopses with this:
> >> 
> 
> The patch fixes qemu, and I don't see the crash on mambo, but I still
> see:
>   [0.010536] smp: Bringing up secondary CPUs ...
>   [0.019189] smp: Brought up 2 nodes, 8 CPUs
>   [0.019210] numa: Node 0 CPUs: 0-3
>   [0.019235] numa: Node 1 CPUs: 4-7
>   [0.02]  the CACHE domain not a subset of the MC domain
>   [0.024505] BUG: arch topology borken
>   [0.024527]  the SMT domain not a subset of the CACHE domain
>   [0.024563] BUG: arch topology borken
>   [0.024584]  the CACHE domain not a subset of the MC domain
>   [0.024645] BUG: arch topology borken
>   [0.024666]  the SMT domain not a subset of the CACHE domain
>   [0.024702] BUG: arch topology borken
>   [0.024723]  the CACHE domain not a subset of the MC domain
> 
> That's the p9 mambo model, using skiboot.tcl from skiboot, with CPUS=2,
> THREADS=4 and MAMBO_NUMA=1.
> 

I was able to reproduce with
 qemu-system-ppc64 -nographic -vga none -M pseries -cpu POWER8 \
   -kernel build~/vmlinux \
   -m 2G,slots=2,maxmem=4G \
   -object memory-backend-ram,size=1G,id=m0 \
   -object memory-backend-ram,size=1G,id=m1 \
   -numa node,nodeid=0,memdev=m0 \
   -numa node,nodeid=1,memdev=m1 \
   -smp 8,threads=4,sockets=2,maxcpus=8  \


If the CPU doesn't have an l2-cache element, then the CPU not only has to set
itself in cpu_l2_cache_mask but also its siblings. Otherwise it can
happen that the sibling mask has 4 CPUs set while the CACHE domain has
just one CPU set, leading to this BUG message.

Patch follows this mail.

> Node layout is:
> 
> [0.00] Early memory node ranges
> [0.00]   node   0: [mem 0x-0x]
> [0.00]   node   1: [mem 0x2000-0x2000]
> [0.00] Initmem setup node 0 [mem 
> 0x-0x]
> [0.00] On node 0 totalpages: 65536
> [0.00] Initmem setup node 1 [mem 
> 0x2000-0x2000]
> [    0.00] On node 1 totalpages: 65536
> 
> 
> There aren't any l2-cache properties in the device-tree under cpus.
> 
> I'll try and have a closer look tonight.
> 
> cheers

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-09-11 Thread Srikar Dronamraju
* Michael Ellerman  [2020-09-11 21:55:23]:

> Srikar Dronamraju  writes:
> > Current code assumes that cpumask of cpus sharing a l2-cache mask will
> > always be a superset of cpu_sibling_mask.
> >
> > Lets stop that assumption. cpu_l2_cache_mask is a superset of
> > cpu_sibling_mask if and only if shared_caches is set.
> 
> I'm seeing oopses with this:
> 
> [0.117392][T1] smp: Bringing up secondary CPUs ...
> [0.156515][T1] smp: Brought up 2 nodes, 2 CPUs
> [0.158265][T1] numa: Node 0 CPUs: 0
> [0.158520][T1] numa: Node 1 CPUs: 1
> [0.167453][T1] BUG: Unable to handle kernel data access on read at 
> 0x800041228298
> [0.167992][T1] Faulting instruction address: 0xc018c128
> [0.168817][T1] Oops: Kernel access of bad area, sig: 11 [#1]
> [0.168964][T1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> [0.169417][T1] Modules linked in:
> [0.170047][T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
> 5.9.0-rc2-00095-g7430ad5aa700 #209
> [0.170305][T1] NIP:  c018c128 LR: c018c0cc CTR: 
> c004dce0
> [0.170498][T1] REGS: c0007e343880 TRAP: 0380   Not tainted  
> (5.9.0-rc2-00095-g7430ad5aa700)
> [0.170602][T1] MSR:  82009033   CR: 
> 4400  XER: 
> [0.170985][T1] CFAR: c018c288 IRQMASK: 0
> [0.170985][T1] GPR00:  c0007e343b10 
> c173e400 4000
> [0.170985][T1] GPR04:  0800 
> 0800 
> [0.170985][T1] GPR08:  c122c298 
> c0003fffc000 c0007fd05ce8
> [0.170985][T1] GPR12: c0007e0119f8 c193 
> 8ade 
> [0.170985][T1] GPR16: c0007e3c0640 0917 
> c0007e3c0658 0008
> [0.170985][T1] GPR20: c15d0bb8 8ade 
> c0f57400 c1817c28
> [0.170985][T1] GPR24: c176dc80 c0007e3c0890 
> c0007e3cfe00 
> [0.170985][T1] GPR28: c1772310 c0007e011900 
> c0007e3c0800 0001
> [0.172750][T1] NIP [c018c128] build_sched_domains+0x808/0x14b0
> [0.172900][T1] LR [c018c0cc] build_sched_domains+0x7ac/0x14b0
> [0.173186][T1] Call Trace:
> [0.173484][T1] [c0007e343b10] [c018bfe8] 
> build_sched_domains+0x6c8/0x14b0 (unreliable)
> [0.173821][T1] [c0007e343c50] [c018dcdc] 
> sched_init_domains+0xec/0x130
> [0.174037][T1] [c0007e343ca0] [c10d59d8] 
> sched_init_smp+0x50/0xc4
> [0.174207][T1] [c0007e343cd0] [c10b45c4] 
> kernel_init_freeable+0x1b4/0x378
> [0.174378][T1] [c0007e343db0] [c00129fc] 
> kernel_init+0x24/0x158
> [0.174740][T1] [c0007e343e20] [c000d9d0] 
> ret_from_kernel_thread+0x5c/0x6c
> [0.175050][T1] Instruction dump:
> [0.175626][T1] 554905ee 71480040 7d2907b4 4182016c 2c29 3920006e 
> 913e002c 41820034
> [0.175841][T1] 7c6307b4 e9300020 78631f24 7d58182a <7d2a482a> 
> f93e0080 7d404828 314a0001
> [0.178340][T1] ---[ end trace 6876b88dd1d4b3bb ]---
> [0.178512][T1]
> [1.180458][T1] Kernel panic - not syncing: Attempted to kill init! 
> exitcode=0x000b
> 
> That's qemu:
> 
> qemu-system-ppc64 -nographic -vga none -M pseries -cpu POWER8 \
>   -kernel build~/vmlinux \
>   -m 2G,slots=2,maxmem=4G \
>   -object memory-backend-ram,size=1G,id=m0 \
>   -object memory-backend-ram,size=1G,id=m1 \
>   -numa node,nodeid=0,memdev=m0 \
>   -numa node,nodeid=1,memdev=m1 \
>   -smp 2,sockets=2,maxcpus=2  \
> 

Thanks Michael for the report, for identifying the patch, and for
giving an easy reproducer. That made my task easy. (My only problem was that
all my PowerKVM hosts had an old compiler that refused to compile newer kernels.)

So in this setup, the CPU doesn't have an l2-cache. And in that scenario, we
miss updating the l2-cache domain. Actually the initial patch had this
exact code. However it was my mistake; I should have reassessed it before
making the changes suggested by Gautham.

Patch below. Do let me know if you want me to send the patch separately.

> 
> On mambo I get:
> 
> [0.005069][T1] smp: Bringing up secondary CPUs ...
> [0.011656][T1] smp: Brought up 2 nodes, 8 CPUs
> [0.011682][T1] numa: Node 0 CPUs: 0-3
> [0.011709][T1] numa: Node 1 CPUs: 4-7
> [0.012015][T1] BUG: arch topology borken
> [0.012040][T1]  the SMT domain not a subset of the CACHE domain
> [

[PATCH v6 2/3] powerpc/numa: Prefer node id queried from vphn

2020-08-18 Thread Srikar Dronamraju
The node id queried from the static device tree may not
be correct. For example, it may always show 0 on a shared processor.
Hence prefer the node id queried from vphn and fall back on the
device-tree-based node id if the vphn query fails.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: David Hildenbrand 
Cc: Aneesh Kumar KV 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2:->v3:
- Resolved comments from Gautham.
Link v2: 
https://lore.kernel.org/linuxppc-dev/20200428093836.27190-1-sri...@linux.vnet.ibm.com/t/#u

Changelog v1:->v2:
- Rebased to v5.7-rc3

 arch/powerpc/mm/numa.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 72f6cca1332c..10c5064eeb88 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -728,21 +728,22 @@ static int __init parse_numa_properties(void)
 */
for_each_present_cpu(i) {
struct device_node *cpu;
-   int nid;
-
-   cpu = of_get_cpu_node(i, NULL);
-   BUG_ON(!cpu);
-   nid = of_node_to_nid_single(cpu);
-   of_node_put(cpu);
+   int nid = vphn_get_nid(i);
 
/*
 * Don't fall back to default_nid yet -- we will plug
 * cpus into nodes once the memory scan has discovered
 * the topology.
 */
-   if (nid < 0)
-   continue;
-   node_set_online(nid);
+   if (nid == NUMA_NO_NODE) {
+   cpu = of_get_cpu_node(i, NULL);
+   BUG_ON(!cpu);
+   nid = of_node_to_nid_single(cpu);
+   of_node_put(cpu);
+   }
+
+   if (likely(nid > 0))
+   node_set_online(nid);
}
 
get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
-- 
2.18.1



[PATCH v6 3/3] powerpc/numa: Offline memoryless cpuless node 0

2020-08-18 Thread Srikar Dronamraju
Currently the Linux kernel with CONFIG_NUMA, on a system with multiple
possible nodes, marks node 0 as online at boot. However in practice,
there are systems which have node 0 as memoryless and cpuless.

This can cause numa_balancing to be enabled on systems with only one node
that has memory and CPUs. The existence of this cpuless, memoryless dummy
node can confuse users/scripts looking at the output of lscpu/numactl.

By marking node 0 as offline, let us stop assuming that node 0 is
always online. If node 0 has CPUs or memory that are online, node 0 will
again be set as online.

v5.8
 available: 2 nodes (0,2)
 node 0 cpus:
 node 0 size: 0 MB
 node 0 free: 0 MB
 node 2 cpus: 0 1 2 3 4 5 6 7
 node 2 size: 32625 MB
 node 2 free: 31490 MB
 node distances:
 node   0   2
   0:  10  20
   2:  20  10

proc and sys files
--
 /sys/devices/system/node/online:0,2
 /proc/sys/kernel/numa_balancing:1
 /sys/devices/system/node/has_cpu:   2
 /sys/devices/system/node/has_memory:2
 /sys/devices/system/node/has_normal_memory: 2
 /sys/devices/system/node/possible:  0-31

v5.8 + patch
--
 available: 1 nodes (2)
 node 2 cpus: 0 1 2 3 4 5 6 7
 node 2 size: 32625 MB
 node 2 free: 31487 MB
 node distances:
 node   2
   2:  10

proc and sys files
--
/sys/devices/system/node/online:2
/proc/sys/kernel/numa_balancing:0
/sys/devices/system/node/has_cpu:   2
/sys/devices/system/node/has_memory:2
/sys/devices/system/node/has_normal_memory: 2
/sys/devices/system/node/possible:  0-31

Example of a node with online CPUs/memory on node 0.
(Same o/p with and without patch)
numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
node 0 size: 32482 MB
node 0 free: 22994 MB
node 1 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
node 1 size: 0 MB
node 1 free: 0 MB
node 2 cpus: 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 
133 134 135 136 137 138 139 140 141 142 143
node 2 size: 0 MB
node 2 free: 0 MB
node 3 cpus: 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 
180 181 182 183 184 185 186 187 188 189 190 191 node 3 size: 0 MB
node 3 free: 0 MB
node distances:
node   0   1   2   3
  0:  10  20  40  40
  1:  20  10  40  40
  2:  40  40  10  20
  3:  40  40  20  10

Note: On Powerpc, cpu_to_node of possible but not present cpus would
previously return 0. Hence this commit depends on commit ("powerpc/numa: Set
numa_node for all possible cpus") and commit ("powerpc/numa: Prefer node id
queried from vphn"). Without the 2 commits, Powerpc system might crash.

1. User space applications like numactl and lscpu that parse sysfs tend to
believe there is an extra online node. This tends to confuse users and
applications. Other user space applications start believing that the system
was not able to use all the resources (i.e. missing resources) or that the
system was not set up correctly.

2. The existence of the dummy node also leads to inconsistent information: the
number of online nodes is inconsistent with the information in the
device-tree and resource dump.

3. When the dummy node is present, single-node non-NUMA systems end up showing
up as NUMA systems and numa_balancing gets enabled. This means we take
the hit from unnecessary numa hinting faults.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Cc: Gautham R Shenoy 
Cc: David Hildenbrand 
Cc: Aneesh Kumar K V 
Signed-off-by: Srikar Dronamraju 
---
Changelog:
v5->v6:
Moved fix from arch independent code to powerpc specific
(Michal Hocko, Christopher Lamater)
 arch/powerpc/mm/numa.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 10c5064eeb88..0d72a7d4360e 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -924,6 +924,16 @@ void __init mem_topology_setup(void)
 {
int cpu;
 
+   /*
+* Linux/mm assumes node 0 to be online at boot. However this is not
+* true on PowerPC, where node 0 is similar to any other node, it
+* could be cpuless, memoryless node. So force node 0 to be offline
+* for now. This will prevent cpuless, memoryless node 0 showing up
+* unnecessarily as online. If a node has cpus or memory that need
+* to be online, then node will anyway be marked online.
+*/
+   node_set_offline(0);

[PATCH v6 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-08-18 Thread Srikar Dronamraju
A Powerpc system with multiple possible nodes and with CONFIG_NUMA
enabled always used to have a node 0, even if node 0 does not have any cpus
or memory attached to it. As per PAPR, the node affinity of a cpu is only
available once it is present/online. For all cpus that are possible but
not present, cpu_to_node() would point to node 0.

To ensure a cpuless, memoryless dummy node is not online, powerpc needs
to make sure the cpu_to_node of all possible but not present cpus is set
to a proper node.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: David Hildenbrand 
Cc: Aneesh Kumar K V 
Signed-off-by: Srikar Dronamraju 
---
Changelog v3:->v4:
- Resolved comments from Christopher.
Link v3: 
http://lore.kernel.org/lkml/20200501031128.19584-1-sri...@linux.vnet.ibm.com/t/#u

 arch/powerpc/mm/numa.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 1f61fa2148b5..72f6cca1332c 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -507,6 +507,11 @@ static int numa_setup_cpu(unsigned long lcpu)
int fcpu = cpu_first_thread_sibling(lcpu);
int nid = NUMA_NO_NODE;
 
+   if (!cpu_present(lcpu)) {
+   set_cpu_numa_node(lcpu, first_online_node);
+   return first_online_node;
+   }
+
/*
 * If a valid cpu-to-node mapping is already available, use it
 * directly instead of querying the firmware, since it represents
@@ -935,8 +940,17 @@ void __init mem_topology_setup(void)
 
reset_numa_cpu_lookup_table();
 
-   for_each_present_cpu(cpu)
+   for_each_possible_cpu(cpu) {
+   /*
+* Powerpc with CONFIG_NUMA always used to have a node 0,
+* even if it was memoryless or cpuless. For all cpus that
+* are possible but not present, cpu_to_node() would point
+* to node 0. To remove a cpuless, memoryless dummy node,
+* powerpc need to make sure all possible but not present
+* cpu_to_node are set to a proper node.
+*/
numa_setup_cpu(cpu);
+   }
 }
 
 void __init initmem_init(void)
-- 
2.18.1



[PATCH v6 0/3] Offline memoryless cpuless node 0

2020-08-18 Thread Srikar Dronamraju
---
Testcase     Time:   Min      Max      Avg      StdDev  %Change
./numa01.sh  Real:   164.20   164.38   164.28   0.08    0.292184%
./numa01.sh  Sys:    0.72     0.90     0.82     0.06    271.951%
./numa01.sh  User:   1300.39  1301.97  1300.94  0.56    -0.0061494%
./numa02.sh  Real:   27.41    27.51    27.45    0.03    0%
./numa02.sh  Sys:    0.09     0.16     0.13     0.03    61.5385%
./numa02.sh  User:   216.38   216.91   216.64   0.21    0.0738552%

numa01.sh
param   no_patchwith_patch  %Change
-   --  --  ---
numa_hint_faults2946055 0   -100%
numa_hint_faults_local  2946055 0   -100%
numa_hit700617  681234  -2.76656%
numa_local  700617  681234  -2.76656%
numa_pte_updates2947175 0   -100%
pgfault 4125926 1120053 -72.8533%
pgmajfault  269 181 -32.7138%

numa02.sh
param   no_patchwith_patch  %Change
-   --  --  ---
numa_hint_faults137623  0   -100%
numa_hint_faults_local  137623  0   -100%
numa_hit51332   54645   6.45406%
numa_local  51332   54645   6.45406%
numa_pte_updates138903  0   -100%
pgfault 247058  116743  -52.7467%
pgmajfault  154 157 1.94805%

Observations:
The real time and user time don't actually change much. However the system
time changes to some extent, the reason being the number of numa hinting
faults. With the patch we are no longer seeing the numa hinting faults.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: David Hildenbrand 
Cc: Aneesh Kumar K V 

Srikar Dronamraju (3):
  powerpc/numa: Set numa_node for all possible cpus
  powerpc/numa: Prefer node id queried from vphn
  powerpc/numa: Offline memoryless cpuless node 0

 arch/powerpc/mm/numa.c | 45 --
 1 file changed, 35 insertions(+), 10 deletions(-)

-- 
2.18.1



Re: [PATCH v5 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline

2020-08-18 Thread Srikar Dronamraju
* Michal Hocko  [2020-08-18 09:37:12]:

> On Tue 18-08-20 09:32:52, David Hildenbrand wrote:
> > On 12.08.20 08:01, Srikar Dronamraju wrote:
> > > Hi Andrew, Michal, David
> > > 
> > > * Andrew Morton  [2020-08-06 21:32:11]:
> > > 
> > >> On Fri, 3 Jul 2020 18:28:23 +0530 Srikar Dronamraju 
> > >>  wrote:
> > >>
> > >>>> The memory hotplug changes that somehow because you can hotremove numa
> > >>>> nodes and therefore make the nodemask sparse but that is not a common
> > >>>> case. I am not sure what would happen if a completely new node was 
> > >>>> added
> > >>>> and its corresponding node was already used by the renumbered one
> > >>>> though. It would likely conflate the two I am afraid. But I am not sure
> > >>>> this is really possible with x86 and a lack of a bug report would
> > >>>> suggest that nobody is doing that at least.
> > >>>>
> > >> So...  do we merge this patch or not?  Seems that the overall view is
> > >> "risky but nobody is likely to do anything better any time soon"?
> > > 
> > > Can we decide on this one way or the other?
> > 
> > Hmm, not sure who's the person to decide. I tend to prefer doing the
> > node renaming, handling this in ppc code;
> 
> Agreed. That would be a safer option.

Okay, will send arch specific v6 version.

> -- 
> Michal Hocko
> SUSE Labs

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v2 1/3] powerpc/numa: Introduce logical numa id

2020-08-17 Thread Srikar Dronamraju
* Aneesh Kumar K.V  [2020-08-17 17:04:24]:

> On 8/17/20 4:29 PM, Srikar Dronamraju wrote:
> > * Aneesh Kumar K.V  [2020-08-17 16:02:36]:
> > 
> > > We use ibm,associativity and ibm,associativity-lookup-arrays to derive 
> > > the numa
> > > node numbers. These device tree properties are firmware indicated 
> > > grouping of
> > > resources based on their hierarchy in the platform. These numbers (group 
> > > id) are
> > > not sequential and hypervisor/firmware can follow different numbering 
> > > schemes.
> > > For ex: on powernv platforms, we group them in the below order.
> > > 
> > >   * - CCM node ID
> > >   * - HW card ID
> > >   * - HW module ID
> > >   * - Chip ID
> > >   * - Core ID
> > > 
> > > Based on ibm,associativity-reference-points we use one of the above group 
> > > ids as
> > > Linux NUMA node id. (On PowerNV platform Chip ID is used). This results
> > > in Linux reporting non-linear NUMA node id and which also results in Linux
> > > reporting empty node 0 NUMA nodes.
> > > 
> > > This can  be resolved by mapping the firmware provided group id to a 
> > > logical Linux
> > > NUMA id. In this patch, we do this only for pseries platforms considering 
> > > the
> > > firmware group id is a virtualized entity and users would not have drawn 
> > > any
> > > conclusion based on the Linux Numa Node id.
> > > 
> > > On PowerNV platform since we have historically mapped Chip ID as Linux 
> > > NUMA node
> > > id, we keep the existing Linux NUMA node id numbering.
> > 
> > I still dont understand how you are going to handle numa distances.
> > With your patch, have you tried dlpar add/remove on a sparsely noded 
> > machine?
> > 
> 
> We follow the same steps when fetching distance information. Instead of
> using affinity domain id, we now use the mapped node id. The relevant hunk
> in the patch is
> 
> + nid = affinity_domain_to_nid();
> 
>   if (nid > 0 &&
> - of_read_number(associativity, 1) >= distance_ref_points_depth) {
> + of_read_number(associativity, 1) >= distance_ref_points_depth) {
>   /*
>* Skip the length field and send start of associativity array
>*/
> 
> I haven't tried dlpar add/remove. I don't have a setup to try that. Do you
> see a problem there?
> 

Yes, I think there can be 2 problems.

1. The distance table may be filled with incorrect data.
2. numactl -H shows a symmetric distance table; that symmetric nature may
be lost.

> -aneesh
> 
> 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v2 1/3] powerpc/numa: Introduce logical numa id

2020-08-17 Thread Srikar Dronamraju
* Aneesh Kumar K.V  [2020-08-17 16:02:36]:

> We use ibm,associativity and ibm,associativity-lookup-arrays to derive the 
> numa
> node numbers. These device tree properties are firmware indicated grouping of
> resources based on their hierarchy in the platform. These numbers (group id) 
> are
> not sequential and hypervisor/firmware can follow different numbering schemes.
> For ex: on powernv platforms, we group them in the below order.
> 
>  * - CCM node ID
>  * - HW card ID
>  * - HW module ID
>  * - Chip ID
>  * - Core ID
> 
> Based on ibm,associativity-reference-points we use one of the above group ids 
> as
> Linux NUMA node id. (On PowerNV platform Chip ID is used). This results
> in Linux reporting non-linear NUMA node id and which also results in Linux
> reporting empty node 0 NUMA nodes.
> 
> This can  be resolved by mapping the firmware provided group id to a logical 
> Linux
> NUMA id. In this patch, we do this only for pseries platforms considering the
> firmware group id is a virtualized entity and users would not have drawn any
> conclusion based on the Linux Numa Node id.
> 
> On PowerNV platform since we have historically mapped Chip ID as Linux NUMA 
> node
> id, we keep the existing Linux NUMA node id numbering.

I still don't understand how you are going to handle numa distances.
With your patch, have you tried dlpar add/remove on a sparsely-noded machine?

-- 
Thanks and Regards
Srikar Dronamraju


[PATCH v3] powerpc/numa: Restrict possible nodes based on platform

2020-08-17 Thread Srikar Dronamraju
As per draft LoPAPR (Revision 2.9_pre7), section B.5.3 "Run Time Abstraction
Services (RTAS) Node" at
https://openpowerfoundation.org/wp-content/uploads/2020/07/LoPAR-20200611.pdf,
there are 2 device tree properties: ibm,max-associativity-domains (which
defines the maximum number of domains that the firmware, i.e. PowerVM, can
support) and ibm,current-associativity-domains (which defines the maximum
number of domains that the platform can support). The value of the
ibm,max-associativity-domains property is always greater than or equal to
that of the ibm,current-associativity-domains property. If the latter
property is not available, use ibm,max-associativity-domains as a fallback.
In this yet-to-be-released LoPAPR, ibm,current-associativity-domains is
mentioned on page 833 / B.5.3, which is covered under the "Appendix B.
System Binding" section.

Currently Powerpc uses the ibm,max-associativity-domains property while
setting the possible number of nodes. This is currently set at 32.
However the possible number of nodes for a platform may be significantly
less. Hence set the possible number of nodes based on the
ibm,current-associativity-domains property.
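
The diff itself is trimmed by the archive below; purely as an illustrative
sketch (assuming the existing find_possible_nodes() and min_common_depth
helpers in arch/powerpc/mm/numa.c, and the standard OF APIs), the property
lookup described above would look roughly like:

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	const __be32 *domains = NULL;
	int prop_length, max_nodes, i;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	/*
	 * Prefer ibm,current-associativity-domains (what the platform
	 * supports) and fall back to ibm,max-associativity-domains
	 * (what the hypervisor could support) when it is absent.
	 */
	domains = of_get_property(rtas, "ibm,current-associativity-domains",
				  &prop_length);
	if (!domains)
		domains = of_get_property(rtas, "ibm,max-associativity-domains",
					  &prop_length);
	if (!domains)
		goto out;

	/* The entry at the NUMA reference depth gives the node count */
	max_nodes = of_read_number(&domains[min_common_depth], 1);
	for (i = 0; i < max_nodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

out:
	of_node_put(rtas);
}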

Nathan Lynch had raised a valid concern that, post LPM (Live Partition
Migration), a user could DLPAR add processors and memory with "new"
associativity properties.
https://lore.kernel.org/linuxppc-dev/871rljfet9@linux.ibm.com/t/#u

He also pointed out that ibm,max-associativity-domains has the same contents
on all currently available PowerVM systems, unlike
ibm,current-associativity-domains and hence may be better able to handle the
new numa associativity properties.

However with the recent commit dbce45628085 ("powerpc/numa: Limit possible
nodes to within num_possible_nodes"), all new numa  associativity properties
are capped to initially set nr_node_ids. Hence this commit should be safe
with any new dlpar add post LPM.

$ lsprop /proc/device-tree/rtas/ibm,*associ*-domains
/proc/device-tree/rtas/ibm,current-associativity-domains
 0005 0001 0002 0002 0002 0010
/proc/device-tree/rtas/ibm,max-associativity-domains
 0005 0001 0008 0020 0020 0100

$ cat /sys/devices/system/node/possible ##Before patch
0-31

$ cat /sys/devices/system/node/possible ##After patch
0-1

Note the maximum number of nodes this platform can support is only 2, but the
possible nodes were set to 32.

This is important because a lot of kernel and user-space code allocates
structures for all possible nodes, leading to a lot of memory that is
allocated but never used.
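
For illustration only (not part of this patch), a typical kernel pattern
whose memory footprint scales with the possible-node count rather than the
online-node count looks like the sketch below; struct foo_stats and
foo_alloc_node_stats() are made-up names:

#include <linux/slab.h>
#include <linux/nodemask.h>

struct foo_stats {
	unsigned long events;
};

/* One entry per *possible* node, whether or not it ever comes online. */
static struct foo_stats *foo_alloc_node_stats(void)
{
	/* nr_node_ids is derived from the possible-node map at boot */
	return kcalloc(nr_node_ids, sizeof(struct foo_stats), GFP_KERNEL);
}

So trimming the possible-node map directly shrinks every such allocation.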

I ran a simple experiment to create and destroy 100 memory cgroups on
boot on a 8 node machine (Power8 Alpine).

Before patch
free -k at boot
  totalusedfree  shared  buff/cache   available
Mem:  523498176 4106816   518820608   22272  570752   516606720
Swap:   4194240   0 4194240

free -k after creating 100 memory cgroups
  totalusedfree  shared  buff/cache   available
Mem:  523498176 4628416   518246464   22336  623296   516058688
Swap:   4194240   0 4194240

free -k after destroying 100 memory cgroups
  totalusedfree  shared  buff/cache   available
Mem:  523498176 4697408   518173760   22400  627008   515987904
Swap:   4194240   0 4194240

After patch
free -k at boot
  totalusedfree  shared  buff/cache   available
Mem:  523498176 3969472   518933888   22272  594816   516731776
Swap:   4194240   0 4194240

free -k after creating 100 memory cgroups
  totalusedfree  shared  buff/cache   available
Mem:  523498176 4181888   518676096   22208  640192   516496448
Swap:   4194240   0 4194240

free -k after destroying 100 memory cgroups
  totalusedfree  shared  buff/cache   available
Mem:  523498176 4232320   518619904   22272  645952   516443264
Swap:   4194240   0 4194240

Observations:
Fixed kernel takes 137344 kb (4106816-3969472) less to boot.
Fixed kernel takes 309184 kb (4628416-4181888-137344) less to create 100 memcgs.

Cc: Nathan Lynch 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Anton Blanchard 
Cc: Bharata B Rao 
Cc: Tyrel Datwyler 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2->v3:
v2: 
https://lore.kernel.org/linuxppc-dev/20200707140644.7241-1-sri...@linux.vnet.ibm.com/t/#u
Updated commit msg to mention LoPAPR reference section  and url details
(Michael Ellerman)
Updated commit msg to discuss about possible new numa associativity properties.
(Nathan Lynch)

Changelog v1->v2:
v1: 
https://lore.kernel.org/linuxppc-dev/20200706064002.14848-1-sri...@linux.vnet.ibm.com/t/#u
Fallback to ibm

Re: [PATCH v5 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline

2020-08-12 Thread Srikar Dronamraju
Hi Andrew, Michal, David

* Andrew Morton  [2020-08-06 21:32:11]:

> On Fri, 3 Jul 2020 18:28:23 +0530 Srikar Dronamraju 
>  wrote:
> 
> > > The memory hotplug changes that somehow because you can hotremove numa
> > > nodes and therefore make the nodemask sparse but that is not a common
> > > case. I am not sure what would happen if a completely new node was added
> > > and its corresponding node was already used by the renumbered one
> > > though. It would likely conflate the two I am afraid. But I am not sure
> > > this is really possible with x86 and a lack of a bug report would
> > > suggest that nobody is doing that at least.
> > > 
> > 
> > JFYI,
> > Satheesh copied in this mailchain had opened a bug a year on crash with vcpu
> > hotplug on memoryless node. 
> > 
> > https://bugzilla.kernel.org/show_bug.cgi?id=202187
> 
> So...  do we merge this patch or not?  Seems that the overall view is
> "risky but nobody is likely to do anything better any time soon"?

Can we decide on this one way or the other?

-- 
Thanks and Regards
Srikar Dronamraju


Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id

2020-08-10 Thread Srikar Dronamraju
* Aneesh Kumar K.V  [2020-08-06 16:14:21]:

> >
> > associativity_to_nid gets called the first time a cpu is being made present
> > from offline. So it need not be in boot path. We may to verify if cpu
> > hotplug, dlpar, operations are synchronized. For example a memory hotadd and
> > cpu hotplug are they synchronized? I am not sure if they are synchronized at
> > this time.
> 
> But you don't online cpu or memory to a non existent node post boot
> right?. If the node is existent we have already initialized the nid_map.
> 

Not sure what you mean by existent and non-existent. Are you referring to
online / offline?

> However i am not sure whether we do a parallel initialization of devices. ie,
> of_device_add getting called in parallel. if it can then we need the
> below?
> 
> @@ -226,6 +226,7 @@ static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 
> 1] =  NUMA_NO_NODE};
>  int firmware_group_id_to_nid(int firmware_gid)
>  {
> static int last_nid = 0;
> +   static DEFINE_SPINLOCK(node_id_lock);
> 
> /*
>  * For PowerNV we don't change the node id. This helps to avoid
> @@ -238,8 +239,13 @@ int firmware_group_id_to_nid(int firmware_gid)
> if (firmware_gid ==  -1)
> return NUMA_NO_NODE;
> 
> -   if (nid_map[firmware_gid] == NUMA_NO_NODE)
> -   nid_map[firmware_gid] = last_nid++;
> +   if (nid_map[firmware_gid] == NUMA_NO_NODE) {
> +   spin_lock(&node_id_lock);
> +   /*  recheck with lock held */
> +   if (nid_map[firmware_gid] == NUMA_NO_NODE)
> +   nid_map[firmware_gid] = last_nid++;
> +   spin_unlock(&node_id_lock);
> +   }
> 
> return nid_map[firmware_gid];
>  }
> 

This should help.


> 
> I will also add a las_nid > MAX_NUMNODES check in
> firmware_group_id_to_nid() to handle the case where we find more numa
> nodes than MAX_NUMANODES in device tree.
> 

Okay, 

What's your plan to handle the node distances?
Currently the node distances we compute from the device tree properties are
based on distance from node 0.  If you rename a different node as node 0,
how do you plan to remap the node distances?

> -aneesh

-- 
Thanks and Regards
Srikar Dronamraju


[PATCH v5 02/10] powerpc/smp: Merge Power9 topology with Power topology

2020-08-10 Thread Srikar Dronamraju
A new sched_domain_topology_level was added just for Power9. However the
same can be achieved by merging powerpc_topology with power9_topology,
which makes the code simpler, especially when adding a new sched
domain.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
since cpu_smt_mask is only defined under CONFIG_SCHED_SMT

 arch/powerpc/kernel/smp.c | 25 +++--
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index edf94ca64eea..08da765b91f1 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1313,7 +1313,7 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymetric SMT dependancy */
+/* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
@@ -1326,14 +1326,6 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
 * This topology makes it *much* cheaper to migrate tasks between adjacent cores
@@ -1361,7 +1353,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
-static struct sched_domain_topology_level power9_topology[] = {
+static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
@@ -1386,21 +1378,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
-   power9_topology[0].mask = smallcore_smt_mask;
powerpc_topology[0].mask = smallcore_smt_mask;
}
 #endif
-   /*
-* If any CPU detects that it's sharing a cache with another CPU then
-* use the deeper topology that is aware of this sharing.
-*/
-   if (shared_caches) {
-   pr_info("Using shared cache scheduler topology\n");
-   set_sched_topology(power9_topology);
-   } else {
-   pr_info("Using standard scheduler topology\n");
-   set_sched_topology(powerpc_topology);
-   }
+   set_sched_topology(powerpc_topology);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
2.18.2



[PATCH v5 01/10] powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES

2020-08-10 Thread Srikar Dronamraju
Fix a build warning seen without CONFIG_NEED_MULTIPLE_NODES:
"error: 'numa_cpu_lookup_table' undeclared"

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2 -> v3:
Removed node caching part. Rewrote the Commit msg (Michael Ellerman)
Renamed to powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES

 arch/powerpc/kernel/smp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..edf94ca64eea 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -860,6 +860,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+#ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
 */
@@ -868,6 +869,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
set_cpu_numa_mem(cpu,
local_memory_node(numa_cpu_lookup_table[cpu]));
}
+#endif
}
 
/* Init the cpumasks so the boot CPU is related to itself */
-- 
2.18.2



[PATCH v5 08/10] powerpc/smp: Allocate cpumask only after searching thread group

2020-08-10 Thread Srikar Dronamraju
If allocated earlier and the search fails, then the cpu_l1_cache_map cpumask
is unnecessarily cleared. However cpu_l1_cache_map can be allocated/cleared
after we search the thread group.

Please note CONFIG_CPUMASK_OFFSTACK is not set on Powerpc. Hence cpumask
allocated by zalloc_cpumask_var_node is never freed.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v4 ->v5:
Updated commit msg on why cpumask need not be freed.
(Michael Ellerman)

 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7403fdcf3821..0536ac06876b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -789,10 +789,6 @@ static int init_cpu_l1_cache_map(int cpu)
if (err)
goto out;
 
-   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
-   GFP_KERNEL,
-   cpu_to_node(cpu));
-
cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
 
if (unlikely(cpu_group_start == -1)) {
@@ -801,6 +797,9 @@ static int init_cpu_l1_cache_map(int cpu)
goto out;
}
 
+   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
for (i = first_thread; i < first_thread + threads_per_core; i++) {
int i_group_start = get_cpu_thread_group_start(i, &tg);
 
-- 
2.18.2



[PATCH v5 10/10] powerpc/smp: Implement cpu_to_coregroup_id

2020-08-10 Thread Srikar Dronamraju
Lookup the coregroup id from the associativity array.

If unable to detect the coregroup id, fall back on the core id.
This way, we ensure the sched_domain degenerates and an extra sched
domain is not created.

Ideally this function should have been implemented in
arch/powerpc/kernel/smp.c. However if it's implemented in mm/numa.c, we
don't need to find the primary domain again.

If the device-tree mentions more than one coregroup, then kernel
implements only the last or the smallest coregroup, which currently
corresponds to the penultimate domain in the device-tree.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Move coregroup_enabled before getting associativity (Gautham)

 arch/powerpc/mm/numa.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0d57779e7942..8b3b3ec7fcc4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1218,6 +1218,26 @@ int find_and_online_cpu_nid(int cpu)
 
 int cpu_to_coregroup_id(int cpu)
 {
+   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   int index;
+
+   if (cpu < 0 || cpu > nr_cpu_ids)
+   return -1;
+
+   if (!coregroup_enabled)
+   goto out;
+
+   if (!firmware_has_feature(FW_FEATURE_VPHN))
+   goto out;
+
+   if (vphn_get_associativity(cpu, associativity))
+   goto out;
+
+   index = of_read_number(associativity, 1);
+   if (index > min_common_depth + 1)
+   return of_read_number(&associativity[index - 1], 1);
+
+out:
return cpu_to_core_id(cpu);
 }
 
-- 
2.18.2



[PATCH v5 09/10] powerpc/smp: Create coregroup domain

2020-08-10 Thread Srikar Dronamraju
Add percpu coregroup maps and masks to create coregroup domain.
If a coregroup doesn't exist, the coregroup domain will be degenerated
in favour of SMT/CACHE domain. Do note this patch is only creating stubs
for cpu_to_coregroup_id. The actual cpu_to_coregroup_id implementation
would be in a subsequent patch.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R. Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v4 ->v5:
Updated commit msg to specify actual implementation of
cpu_to_coregroup_id is in a subsequent patch (Michael Ellerman)

Changelog v3 ->v4:
if coregroup_support doesn't exist, update MC mask to the next
smaller domain mask.

Changelog v2 -> v3:
Add optimization for mask updation under coregroup_support

Changelog v1 -> v2:
Moved coregroup topology fixup to fixup_topology (Gautham)

 arch/powerpc/include/asm/topology.h | 10 ++
 arch/powerpc/kernel/smp.c   | 54 -
 arch/powerpc/mm/numa.c  |  5 +++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f0b6300e7dd3..6609174918ab 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 
*cpu2_assoc)
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int find_and_online_cpu_nid(int cpu);
+extern int cpu_to_coregroup_id(int cpu);
 #else
 static inline int find_and_online_cpu_nid(int cpu)
 {
return 0;
 }
 
+static inline int cpu_to_coregroup_id(int cpu)
+{
+#ifdef CONFIG_SMP
+   return cpu_to_core_id(cpu);
+#else
+   return 0;
+#endif
+}
+
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 0536ac06876b..566e3accac3e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -80,12 +80,22 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
 
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
+enum {
+#ifdef CONFIG_SCHED_SMT
+   smt_idx,
+#endif
+   cache_idx,
+   mc_idx,
+   die_idx,
+};
+
 #define MAX_THREAD_LIST_SIZE   8
 #define THREAD_GROUP_SHARE_L1   1
 struct thread_groups {
@@ -861,11 +871,27 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static struct cpumask *cpu_coregroup_mask(int cpu)
+{
+   return per_cpu(cpu_coregroup_map, cpu);
+}
+
+static bool has_coregroup_support(void)
+{
+   return coregroup_enabled;
+}
+
+static const struct cpumask *cpu_mc_mask(int cpu)
+{
+   return cpu_coregroup_mask(cpu);
+}
+
 static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_mc_mask, SD_INIT_NAME(MC) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -912,6 +938,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+   if (has_coregroup_support())
+   zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
@@ -929,6 +959,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
+   if (has_coregroup_support())
+   cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
+
init_big_cores();
if (has_big_cores) {
cpumask_set_cpu(boot_cpuid,
@@ -1220,6 +1253,8 @@ static void remove_cpu_from_masks(int cpu)
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
+   if (has_coregroup_support())
+   set_cpus_unrelated(cpu, i, cpu_coregroup_mask);

[PATCH v5 07/10] powerpc/numa: Detect support for coregroup

2020-08-10 Thread Srikar Dronamraju
Add support for grouping cores based on the device-tree classification.
- The last domain in the associativity domains always refers to the
core.
- If primary reference domain happens to be the penultimate domain in
the associativity domains device-tree property, then there are no
coregroups. However if its not a penultimate domain, then there are
coregroups. There can be more than one coregroup. For now we would be
interested in the last or the smallest coregroups, i.e one sub-group
per DIE.

Currently there are no firmwares that are exposing this grouping. Hence
allow the basis for grouping to be abstract.  Once the firmware starts
using this grouping, code would be added to detect the type of grouping
and adjust the sd domain flags accordingly.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v4->v5:
Updated commit msg with current abstract nature of the coregroups
(Michael Ellerman)

Changelog v1 -> v2:
Explained Coregroup in commit msg (Michael Ellerman)

 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c  |  1 +
 arch/powerpc/mm/numa.c | 34 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 49a25e2400f2..5bdc17a7049f 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -28,6 +28,7 @@
 extern int boot_cpuid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
+extern bool coregroup_enabled;
 
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 91cf5d05e7ec..7403fdcf3821 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 struct task_struct *secondary_current;
 bool has_big_cores;
+bool coregroup_enabled;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 2298899a0f0a..51cb672f113b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -886,7 +886,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, 
u64 end_pfn)
 static void __init find_possible_nodes(void)
 {
struct device_node *rtas;
-   u32 numnodes, i;
+   const __be32 *domains;
+   int prop_length, max_nodes;
+   u32 i;
 
if (!numa_enabled)
return;
@@ -895,25 +897,31 @@ static void __init find_possible_nodes(void)
if (!rtas)
return;
 
-   if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains",
-   min_common_depth, &numnodes)) {
-   /*
-* ibm,current-associativity-domains is a fairly recent
-* property. If it doesn't exist, then fallback on
-* ibm,max-associativity-domains. Current denotes what the
-* platform can support compared to max which denotes what the
-* Hypervisor can support.
-*/
-   if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains",
-   min_common_depth, &numnodes))
+   /*
+* ibm,current-associativity-domains is a fairly recent property. If
+* it doesn't exist, then fallback on ibm,max-associativity-domains.
+* Current denotes what the platform can support compared to max
+* which denotes what the Hypervisor can support.
+*/
+   domains = of_get_property(rtas, "ibm,current-associativity-domains",
+   &prop_length);
+   if (!domains) {
+   domains = of_get_property(rtas, "ibm,max-associativity-domains",
+   &prop_length);
+   if (!domains)
goto out;
}
 
-   for (i = 0; i < numnodes; i++) {
+   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
+   prop_length /= sizeof(int);
+   if (prop_length > min_common_depth + 2)
+   coregroup_enabled = 1;
+
 out:
of_node_put(rtas);
 }
-- 
2.18.2



[PATCH v5 06/10] powerpc/smp: Optimize start_secondary

2020-08-10 Thread Srikar Dronamraju
In start_secondary, even if shared_cache was already set, system does a
redundant match for cpumask. This redundant check can be removed by
checking if shared_cache is already set.

While here, localize the sibling_mask variable to within the if
condition.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Signed-off-by: Srikar Dronamraju 
---
Changelog v4 ->v5:
Retain cache domain, no need for generalization
 (Michael Ellerman, Peter Zijlstra,
 Valentin Schneider, Gautham R. Shenoy)

Changelog v1 -> v2:
Moved shared_cache topology fixup to fixup_topology (Gautham)

 arch/powerpc/kernel/smp.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 0c960ce3be42..91cf5d05e7ec 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -851,7 +851,7 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-   return cpu_l2_cache_mask(cpu);
+   return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -1305,7 +1305,6 @@ static void add_cpu_to_masks(int cpu)
 void start_secondary(void *unused)
 {
unsigned int cpu = smp_processor_id();
-   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
 
mmgrab(_mm);
current->active_mm = _mm;
@@ -1331,14 +1330,20 @@ void start_secondary(void *unused)
/* Update topology CPU masks */
add_cpu_to_masks(cpu);
 
-   if (has_big_cores)
-   sibling_mask = cpu_smallcore_mask;
/*
 * Check for any shared caches. Note that this must be done on a
 * per-core basis because one core in the pair might be disabled.
 */
-   if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
-   shared_caches = true;
+   if (!shared_caches) {
+   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+   struct cpumask *mask = cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   sibling_mask = cpu_smallcore_mask;
+
+   if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+   shared_caches = true;
+   }
 
set_numa_node(numa_cpu_lookup_table[cpu]);
set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
-- 
2.18.2



[PATCH v5 04/10] powerpc/smp: Move topology fixups into a new function

2020-08-10 Thread Srikar Dronamraju
Move topology fixup based on the platform attributes into its own
function which is called just before set_sched_topology.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2 -> v3:
Rewrote changelog (Gautham)
Renamed to powerpc/smp: Move topology fixups into  a new function

 arch/powerpc/kernel/smp.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 39224a042468..b13161a5ffc3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1362,6 +1362,16 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
+static void fixup_topology(void)
+{
+#ifdef CONFIG_SCHED_SMT
+   if (has_big_cores) {
+   pr_info("Big cores detected but using small core scheduling\n");
+   powerpc_topology[0].mask = smallcore_smt_mask;
+   }
+#endif
+}
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
@@ -1375,12 +1385,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
dump_numa_cpu_topology();
 
-#ifdef CONFIG_SCHED_SMT
-   if (has_big_cores) {
-   pr_info("Big cores detected but using small core scheduling\n");
-   powerpc_topology[0].mask = smallcore_smt_mask;
-   }
-#endif
+   fixup_topology();
set_sched_topology(powerpc_topology);
 }
 
-- 
2.18.2



[PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-08-10 Thread Srikar Dronamraju
Current code assumes that cpumask of cpus sharing a l2-cache mask will
always be a superset of cpu_sibling_mask.

Lets stop that assumption. cpu_l2_cache_mask is a superset of
cpu_sibling_mask if and only if shared_caches is set.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Set cpumask after verifying l2-cache. (Gautham)

 arch/powerpc/kernel/smp.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b13161a5ffc3..0c960ce3be42 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1188,6 +1188,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
*(*mask_fn)(int))
if (!l2_cache)
return false;
 
+   cpumask_set_cpu(cpu, mask_fn(cpu));
for_each_cpu(i, cpu_online_mask) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1270,29 +1271,30 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   /*
-* Copy the thread sibling mask into the cache sibling mask
-* and mark any CPUs that share an L2 with this CPU.
-*/
-   for_each_cpu(i, cpu_sibling_mask(cpu))
-   set_cpus_related(cpu, i, cpu_l2_cache_mask);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-   /*
-* Copy the cache sibling mask into core sibling mask and mark
-* any CPUs on the same chip as this CPU.
-*/
-   for_each_cpu(i, cpu_l2_cache_mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
+   if (pkg_id == -1) {
+   struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+   /*
+* Copy the sibling mask into core sibling mask and
+* mark any CPUs on the same chip as this CPU.
+*/
+   if (shared_caches)
+   mask = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask(cpu))
+   set_cpus_related(cpu, i, cpu_core_mask);
 
-   if (pkg_id == -1)
return;
+   }
 
for_each_cpu(i, cpu_online_mask)
if (get_physical_package_id(i) == pkg_id)
-- 
2.18.2



[PATCH v5 03/10] powerpc/smp: Move powerpc_topology above

2020-08-10 Thread Srikar Dronamraju
Just moving the powerpc_topology description above.
This will help in using functions in this file and avoid declarations.

No other functional changes

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Cc: Vaidyanathan Srinivasan 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 104 +++---
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 08da765b91f1..39224a042468 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -818,6 +818,58 @@ static int init_cpu_l1_cache_map(int cpu)
return err;
 }
 
+static bool shared_caches;
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static int powerpc_smt_flags(void)
+{
+   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+   flags |= SD_ASYM_PACKING;
+   }
+   return flags;
+}
+#endif
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+   return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+   return cpu_l2_cache_mask(cpu);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *smallcore_smt_mask(int cpu)
+{
+   return cpu_smallcore_mask(cpu);
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { NULL, },
+};
+
 static int init_big_cores(void)
 {
int cpu;
@@ -1247,8 +1299,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_core_mask);
 }
 
-static bool shared_caches;
-
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
@@ -1312,56 +1362,6 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
-#ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymmetric SMT dependency */
-static int powerpc_smt_flags(void)
-{
-   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-
-   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-   flags |= SD_ASYM_PACKING;
-   }
-   return flags;
-}
-#endif
-
-/*
- * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
- * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
- * since the migrated task remains cache hot. We want to take advantage of this
- * at the scheduler level so an extra topology level is required.
- */
-static int powerpc_shared_cache_flags(void)
-{
-   return SD_SHARE_PKG_RESOURCES;
-}
-
-/*
- * We can't just pass cpu_l2_cache_mask() directly because
- * returns a non-const pointer and the compiler barfs on that.
- */
-static const struct cpumask *shared_cache_mask(int cpu)
-{
-   return cpu_l2_cache_mask(cpu);
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
-{
-   return cpu_smallcore_mask(cpu);
-}
-#endif
-
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
-- 
2.18.2



[PATCH v5 00/10] Coregroup support on Powerpc

2020-08-10 Thread Srikar Dronamraju
-
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 ,,,0055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 ,,,00aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After patchset
--
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
MC
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 ,,,0055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain4 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 ,,,00aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 

Srikar Dronamraju (10):
  powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES
  powerpc/smp: Merge Power9 topology with Power topology
  powerpc/smp: Move powerpc_topology above
  powerpc/smp: Move topology fixups into  a new function
  powerpc/smp: Dont assume l2-cache to be superset of sibling
  powerpc/smp: Optimize start_secondary
  powerpc/numa: Detect support for coregroup
  powerpc/smp: Allocate cpumask only after searching thread group
  powerpc/smp: Create coregroup domain
  powerpc/smp: Implement cpu_to_coregroup_id

 arch/powerpc/include/asm/smp.h  |   1 +
 arch/powerpc/include/asm/topology.h |  10 ++
 arch/powerpc/kernel/smp.c   | 235 +---
 arch/powerpc/mm/numa.c  |  59 +--
 4 files changed, 198 insertions(+), 107 deletions(-)

-- 
2.18.2



[PATCH v2 2/2] powerpc/topology: Override cpu_smt_mask

2020-08-07 Thread Srikar Dronamraju
 1182315.7 60018.733

1018433 - 1047037 : ##3% (3)
1047037 - 1075640 :   4% (4)
1075640 - 1104244 :   4% (4)
1104244 - 1132848 : ###   7% (7)
1132848 - 1161452 :  17% 
(17)
1161452 - 1190055 : ##   12% 
(12)
1190055 - 1218659 : #21% 
(21)
1218659 - 1247263 : ##   23% 
(23)
1247263 - 1275866 :   4% (4)
1275866 - 1304470 :   4% (4)

with patch
  N   Min   MaxMedian   AvgStddev
100967014   1292938   1208819 1185281.8 69815.851

 967014 - 999606  : ##1% (1)
 999606 - 1032199 : ##1% (1)
1032199 - 1064791 :   6% (6)
1064791 - 1097384 : ##5% (5)
1097384 - 1129976 : ##9% (9)
1129976 - 1162568 :  10% 
(10)
1162568 - 1195161 : ##   13% 
(13)
1195161 - 1227753 :  22% 
(22)
1227753 - 1260346 : ##   25% 
(25)
1260346 - 1292938 : ##7% (7)

Observations: Not much changes, ebizzy is not much impacted.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Vincent Guittot 
Cc: Vaidyanathan Srinivasan 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1->v2:
Modified commit msg as per mailing list discussion.
Added performance numbers

 arch/powerpc/include/asm/cputhreads.h |  1 -
 arch/powerpc/include/asm/smp.h| 13 +
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/cputhreads.h 
b/arch/powerpc/include/asm/cputhreads.h
index deb99fd6e060..98c8bd155bf9 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -23,7 +23,6 @@
 extern int threads_per_core;
 extern int threads_per_subcore;
 extern int threads_shift;
-extern bool has_big_cores;
 extern cpumask_t threads_core_mask;
 #else
 #define threads_per_core   1
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 9cd0765633c5..bb06aa875131 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -131,6 +131,19 @@ static inline struct cpumask *cpu_smallcore_mask(int cpu)
 
 extern int cpu_to_core_id(int cpu);
 
+extern bool has_big_cores;
+
+#define cpu_smt_mask cpu_smt_mask
+#ifdef CONFIG_SCHED_SMT
+static inline const struct cpumask *cpu_smt_mask(int cpu)
+{
+   if (has_big_cores)
+   return per_cpu(cpu_smallcore_map, cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
+}
+#endif /* CONFIG_SCHED_SMT */
+
 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
-- 
2.18.2



[PATCH v2 1/2] sched/topology: Allow archs to override cpu_smt_mask

2020-08-07 Thread Srikar Dronamraju
cpu_smt_mask tracks topology_sibling_cpumask. This would be good for
most architectures. One of the users of cpu_smt_mask(), would be to
identify idle-cores. On Power9, a pair of SMT4 cores can be presented by
the firmware as a SMT8 core for backward compatibility reasons.

Powerpc allows LPARs to be live migrated from Power8 to Power9. Do note
Power8 had only SMT8 cores. Existing software which has been
developed/configured for Power8 would expect to see SMT8 core.
Maintaining the illusion of SMT8 core is a requirement to make that
work.

In order to maintain above userspace backward compatibility with
previous versions of processor, Power9 onwards there is option to the
firmware to advertise a pair of SMT4 cores as a fused cores aka SMT8
core. On Power9 this pair shares the L2 cache as well. However, from the
scheduler's point of view, a core should be determined by SMT4, since
its a completely independent unit of compute. Hence allow PowerPc
architecture to override the default cpu_smt_mask() to point to the SMT4
cores in a SMT8 mode.

This will ensure the scheduler is always given the right information.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Vincent Guittot 
Cc: Vaidyanathan Srinivasan 
Acked-by: Peter Zijlstra (Intel) 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1->v2:
Update the commit msg based on the discussion in community esp
with Peter Zijlstra and Michael Ellerman.

 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 608fa4aadf0e..ad03df1cc266 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -198,7 +198,7 @@ static inline int cpu_to_mem(int cpu)
 #define topology_die_cpumask(cpu)  cpumask_of(cpu)
 #endif
 
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
 static inline const struct cpumask *cpu_smt_mask(int cpu)
 {
return topology_sibling_cpumask(cpu);
-- 
2.18.2



Re: [PATCH 1/2] sched/topology: Allow archs to override cpu_smt_mask

2020-08-06 Thread Srikar Dronamraju
* pet...@infradead.org  [2020-08-06 15:15:47]:

> > But my understanding is most LPARs don't get migrated back and forth,
> > they'll start life on a P8 and only get migrated to a P9 once when the
> > customer gets a P9. They might then run for a long time (months to
> > years) on the P9 in P8 compat mode, not because they ever want to
> > migrate back to a real P8, but because the software in the LPAR is still
> > expecting to be on a P8.
> > 
> > I'm not a real expert on all the Enterprisey stuff though, so someone
> > else might be able to give us a better picture.
> > 
> > But the point of mentioning the migration stuff was mainly just to
> > explain why we feel we need to present SMT8 to userspace even on P9.
> 
> OK, fair enough. The patch wasn't particularly onerous, I was just
> wondering why etc..
> 
> The case of starting on a P8 and being migrated to a P9 makes sense to
> me; in that case you'd like to rebuild your sched domains, but can't go
> about changing user visible topolofy information.
> 
> I suppose:
> 
> Acked-by; Peter Zijlstra (Intel) 
> 
> An updated Changelog that recaps some of this discussion might also be
> nice.

Okay, will surely do the needful.

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH 1/2] sched/topology: Allow archs to override cpu_smt_mask

2020-08-06 Thread Srikar Dronamraju
* pet...@infradead.org  [2020-08-06 10:54:29]:

> On Thu, Aug 06, 2020 at 03:32:25PM +1000, Michael Ellerman wrote:
> 
> > That brings with it a bunch of problems, such as existing software that
> > has been developed/configured for Power8 and expects to see SMT8.
> > 
> > We also allow LPARs to be live migrated from Power8 to Power9 (and back), so
> > maintaining the illusion of SMT8 is considered a requirement to make that 
> > work.
> 
> So how does that work if the kernel booted on P9 and demuxed the SMT8
> into 2xSMT4? If you migrate that state onto a P8 with actual SMT8 you're
> toast again.
> 

To add to what Michael already said, the reason we don't expose the demux of
SMT8 into 2xSMT4 to userspace, is to make the userspace believe they are on
a SMT8. When the kernel is live migrated from P8 to P9, till the time of reboot
they would only have the older P8 topology. After reboot the kernel topology
would change, but the userspace is made to believe that they are running on
SMT8 core by way of keeping the sibling_cpumask at SMT8 core level.

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH 1/2] sched/topology: Allow archs to override cpu_smt_mask

2020-08-04 Thread Srikar Dronamraju
* pet...@infradead.org  [2020-08-04 12:45:20]:

> On Tue, Aug 04, 2020 at 09:03:06AM +0530, Srikar Dronamraju wrote:
> > cpu_smt_mask tracks topology_sibling_cpumask. This would be good for
> > most architectures. One of the users of cpu_smt_mask(), would be to
> > identify idle-cores. On Power9, a pair of cores can be presented by the
> > firmware as a big-core for backward compatibility reasons.
> > 
> > In order to maintain userspace backward compatibility with previous
> > versions of processor, (since Power8 had SMT8 cores), Power9 onwards there
> > is option to the firmware to advertise a pair of SMT4 cores as a fused
> > cores (referred to as the big_core mode in the Linux Kernel). On Power9
> > this pair shares the L2 cache as well. However, from the scheduler's point
> > of view, a core should be determined by SMT4. The load-balancer already
> > does this. Hence allow PowerPc architecture to override the default
> > cpu_smt_mask() to point to the SMT4 cores in a big_core mode.
> 
> I'm utterly confused.
> 
> Why can't you set your regular siblings mask to the smt4 thing? Who
> cares about the compat stuff, I thought that was an LPAR/AIX thing.

There are no technical challenges to set the sibling mask to SMT4.
This is for Linux running on PowerVM. These Power9 boxes are sold /
marketed as X core boxes (where X stands for SMT8 cores). Since in the
PowerVM world everything is in SMT8 mode, the device tree properties still
mark the system as running 8-thread cores. There are a number of utilities
like ppc64_cpu that directly read from the device-tree. They would get a core
count and thread count which is SMT8 based.

If the sibling_mask is set to the small core, then the same user, when looking
at output from lscpu and other utilities that look at sysfs, will start seeing
2x the number of cores compared to what was provisioned and what the utilities
based on the device-tree show. This can get users confused.

So to keep the device-tree properties, utilities depending on device-tree,
sysfs and utilities depending on sysfs on the same page, userspace are only
exposed as SMT8.

-- 
Thanks and Regards
Srikar Dronamraju


Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id

2020-08-04 Thread Srikar Dronamraju
* Aneesh Kumar K.V  [2020-08-02 19:51:41]:
> Srikar Dronamraju  writes:
> > * Aneesh Kumar K.V  [2020-07-31 16:49:14]:
> >
> >
> > If its just to eliminate node 0, then we have 2 other probably better
> > solutions.
> > 1. Dont mark node 0 as spl (currently still in mm-tree and a result in
> > linux-next)
> > 2. powerpc specific: explicitly clear node 0 during numa bringup.
> >
> 
> 
> I am not sure I consider them better. But yes, those patches are good
> and also resolves the node 0 initialization when the firmware didn't
> indicate the presence of such a node.
> 
> This patch in addition make sure that we get the same topolgy report
> across reboot on a virtualized partitions as longs as the cpu/memory
> ratio per powervm domains remain the same. This should also help to
> avoid confusion after an LPM migration once we start applying topology
> updates. 
> 

What do we mean by cpu/memory ratio? The topology across reboots would have
changed only if PowerVM had allocated resources differently by
scrambling/unscrambling. We no longer process topology updates at
runtime. As far as I know, after LPM, the source topology is maintained.

> >> This can  be resolved by mapping the firmware provided group id to a 
> >> logical Linux
> >> NUMA id. In this patch, we do this only for pseries platforms considering 
> >> the
> >
> > On PowerVM, as you would know the nid is already a logical or a flattened
> > chip-id and not the actual hardware chip-id.
> 
> Yes. But then they are derived based on PowerVM resources AKA domains.
> Now based on the available resource on a system, we could end up with
> different node numbers with same toplogy across reboots. Making it
> logical at OS level prevent that. 

The above statement kind of gives an impression, that topology changes
across every reboot.  We only end up with different node numbers if and only
if the underlying topology has changed and that case is very rare. Or am I
missing something?

> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index e437a9ac4956..6c659aada55b 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
> >>}
> >>  }
> >> 
> >> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  
> >> NUMA_NO_NODE};
> >> +
> >> +int firmware_group_id_to_nid(int firmware_gid)
> >> +{
> >> +  static int last_nid = 0;
> >> +
> >> +  /*
> >> +   * For PowerNV we don't change the node id. This helps to avoid
> >> +   * confusion w.r.t the expected node ids. On pseries, node numbers
> >> +   * are virtualized. Hence do logical node id for pseries.
> >> +   */
> >> +  if (!firmware_has_feature(FW_FEATURE_LPAR))
> >> +  return firmware_gid;
> >> +
> >> +  if (firmware_gid ==  -1)
> >> +  return NUMA_NO_NODE;
> >> +
> >> +  if (nid_map[firmware_gid] == NUMA_NO_NODE)
> >> +  nid_map[firmware_gid] = last_nid++;
> >
> > How do we ensure 2 simultaneous firmware_group_id_to_nid() calls dont end up
> > at this place in parallel?
> 
> Do we have a code path where we do that? All the node id init should
> happen early and there should not be two cpus doing node init at the
> same time. I might be mistaken. Can you point to the code path where you
> expect this to be called in parallel?
> 

associativity_to_nid gets called the first time a cpu is being made present
from offline, so it need not be in the boot path. We may need to verify
whether cpu hotplug and dlpar operations are synchronized. For example, are a
memory hotadd and a cpu hotplug synchronized? I am not sure if they are
synchronized at this time.

> >
> >> +
> >> +  return nid_map[firmware_gid];
> >> +}
> >> +
> >>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
> >>   * info is found.
> >>   */
> >>  static int associativity_to_nid(const __be32 *associativity)
> >>  {
> >>int nid = NUMA_NO_NODE;
> >> +  int firmware_gid = -1;
> >> 
> >>if (!numa_enabled)
> >>goto out;
> >> 
> >>if (of_read_number(associativity, 1) >= min_common_depth)
> >> -  nid = of_read_number([min_common_depth], 1);
> >> +  firmware_gid = of_read_number([min_common_depth], 
> >> 1);
> >> 
> >>/* POWER4 LPAR uses 0x as invalid node */
> >> -  if (nid == 0xf

[PATCH 2/2] powerpc/topology: Override cpu_smt_mask

2020-08-03 Thread Srikar Dronamraju
On Power9 a pair of cores can be presented by the firmware as a big-core
for backward compatibility reasons, with 4 threads per (small) core and 8
threads per big-core. cpu_smt_mask() should generally point to the cpu mask
of the (small)core.

In order to maintain userspace backward compatibility (with Power8 chips in
case of Power9) in enterprise Linux systems, the topology_sibling_cpumask
has to be set to big-core. Hence override the default cpu_smt_mask() to be
powerpc specific allowing for better scheduling behaviour on Power.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Vincent Guittot 
Cc: Vaidyanathan Srinivasan 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/include/asm/cputhreads.h |  1 -
 arch/powerpc/include/asm/smp.h| 13 +
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/cputhreads.h 
b/arch/powerpc/include/asm/cputhreads.h
index deb99fd6e060..98c8bd155bf9 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -23,7 +23,6 @@
 extern int threads_per_core;
 extern int threads_per_subcore;
 extern int threads_shift;
-extern bool has_big_cores;
 extern cpumask_t threads_core_mask;
 #else
 #define threads_per_core   1
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 9cd0765633c5..d4bc28accb28 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -131,6 +131,19 @@ static inline struct cpumask *cpu_smallcore_mask(int cpu)
 
 extern int cpu_to_core_id(int cpu);
 
+extern bool has_big_cores;
+
+#define cpu_smt_mask cpu_smt_mask
+#ifdef CONFIG_SCHED_SMT
+static inline const struct cpumask *cpu_smt_mask(int cpu)
+{
+   if (has_big_cores)
+   return per_cpu(cpu_smallcore_map, cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
+}
+#endif /* CONFIG_SCHED_SMT */
+
 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
-- 
2.18.2



[PATCH 1/2] sched/topology: Allow archs to override cpu_smt_mask

2020-08-03 Thread Srikar Dronamraju
cpu_smt_mask tracks topology_sibling_cpumask. This would be good for
most architectures. One of the users of cpu_smt_mask(), would be to
identify idle-cores. On Power9, a pair of cores can be presented by the
firmware as a big-core for backward compatibility reasons.

In order to maintain userspace backward compatibility with previous
versions of processor, (since Power8 had SMT8 cores), Power9 onwards there
is option to the firmware to advertise a pair of SMT4 cores as a fused
cores (referred to as the big_core mode in the Linux Kernel). On Power9
this pair shares the L2 cache as well. However, from the scheduler's point
of view, a core should be determined by SMT4. The load-balancer already
does this. Hence allow PowerPc architecture to override the default
cpu_smt_mask() to point to the SMT4 cores in a big_core mode.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Vincent Guittot 
Cc: Vaidyanathan Srinivasan 
Signed-off-by: Srikar Dronamraju 
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 608fa4aadf0e..ad03df1cc266 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -198,7 +198,7 @@ static inline int cpu_to_mem(int cpu)
 #define topology_die_cpumask(cpu)  cpumask_of(cpu)
 #endif

-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
 static inline const struct cpumask *cpu_smt_mask(int cpu)
 {
return topology_sibling_cpumask(cpu);
-- 
2.18.2



Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain

2020-08-03 Thread Srikar Dronamraju
> > Also in the current P9 itself, two neighbouring core-pairs form a quad.
> > Cache latency within a quad is better than a latency to a distant core-pair.
> > Cache latency within a core pair is way better than latency within a quad.
> > So if we have only 4 threads running on a DIE all of them accessing the same
> > cache-lines, then we could probably benefit if all the tasks were to run
> > within the quad aka MC/Coregroup.
> >
> 
> Did you test this? WRT load balance we do try to balance "load" over the
> different domain spans, so if you represent quads as their own MC domain,
> you would AFAICT end up spreading tasks over the quads (rather than packing
> them) when balancing at e.g. DIE level. The desired behaviour might be
> hackable with some more ASYM_PACKING, but I'm not sure I should be
> suggesting that :-)
> 

Agree, load balance will try to spread the load across the quads. In my hack,
I was explicitly marking QUAD domains as !SD_PREFER_SIBLING + relaxing few
load spreading rules when SD_PREFER_SIBLING was not set. And this was on a
slightly older kernel (without recent Vincent's load balance overhaul).

> > I have found some benchmarks which are latency sensitive to benefit by
> > having a grouping a quad level (using kernel hacks and not backed by
> > firmware changes). Gautham also found similar results in his experiments
> > but he only used binding within the stock kernel.
> >
> 
> IIUC you reflect this "fabric quirk" (i.e. coregroups) using this DT
> binding thing.
> 
> That's also where things get interesting (for me) because I experienced
> something similar on another arm64 platform (ThunderX1). This was more
> about cache bandwidth than cache latency, but IMO it's in the same bag of
> fabric quirks. I blabbered a bit about this at last LPC [1], but kind of
> gave up on it given the TX1 was the only (arm64) platform where I could get
> both significant and reproducible results.
> 
> Now, if you folks are seeing this on completely different hardware and have
> "real" workloads that truly benefit from this kind of domain partitioning,
> this might be another incentive to try and sort of generalize this. That's
> outside the scope of your series, but your findings give me some hope!
> 
> I think what I had in mind back then was that if enough folks cared about
> it, we might get some bits added to the ACPI spec; something along the
> lines of proximity domains for the caches described in PPTT, IOW a cache
> distance matrix. I don't really know what it'll take to get there, but I
> figured I'd dump this in case someone's listening :-)
> 

Very interesting.

> > I am not setting SD_SHARE_PKG_RESOURCES in MC/Coregroup sd_flags as in MC
> > domain need not be LLC domain for Power.
> 
> From what I understood your MC domain does seem to map to LLC; but in any
> case, shouldn't you set that flag at least for BIGCORE (i.e. L2)? AIUI with
> your changes your sd_llc is gonna be SMT, and that's not going to be a very
> big mask. IMO you do want to correctly reflect your LLC situation via this
> flag to make cpus_share_cache() work properly.

I detect if the LLC is shared at BIGCORE, and if they are shared at BIGCORE,
then I dynamically rename the DOMAIN as CACHE and enable
SD_SHARE_PKG_RESOURCES in that domain.

> 
> [1]: https://linuxplumbersconf.org/event/4/contributions/484/

Thanks for the pointer.

-- 
Thanks and Regards
Srikar Dronamraju


Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id

2020-07-31 Thread Srikar Dronamraju
epth], 1);
> + firmware_gid = of_read_number(&associativity[min_common_depth], 1);
> 
>   /* POWER4 LPAR uses 0x as invalid node */
> - if (nid == 0x || nid >= MAX_NUMNODES)
> - nid = NUMA_NO_NODE;
> + if (firmware_gid == 0x || firmware_gid >= MAX_NUMNODES)
> + firmware_gid = -1;

Lets assume two or more invocations of associativity_to_nid for the same
associativity, end up with -1, In each case aren't giving different
nids?


> +
> + nid = firmware_group_id_to_nid(firmware_gid);
> 
>   if (nid > 0 &&
> - of_read_number(associativity, 1) >= distance_ref_points_depth) {
> + of_read_number(associativity, 1) >= distance_ref_points_depth) {
>   /*
>* Skip the length field and send start of associativity array
>*/
> @@ -432,24 +458,25 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>  static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  {
>   struct assoc_arrays aa = { .arrays = NULL };
> - int default_nid = NUMA_NO_NODE;
> - int nid = default_nid;
> + int nid = NUMA_NO_NODE, firmware_gid;
>   int rc, index;
> 
>   if ((min_common_depth < 0) || !numa_enabled)
> - return default_nid;
> + return NUMA_NO_NODE;
> 
>   rc = of_get_assoc_arrays();
>   if (rc)
> - return default_nid;
> + return NUMA_NO_NODE;

https://lore.kernel.org/linuxppc-dev/87lfjc1b5f@linux.ibm.com/t/#u

> 
>   if (min_common_depth <= aa.array_sz &&
>   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> aa.n_arrays) {
>   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> - nid = of_read_number(&aa.arrays[index], 1);
> + firmware_gid = of_read_number(&aa.arrays[index], 1);
> 
> - if (nid == 0x || nid >= MAX_NUMNODES)
> - nid = default_nid;
> + if (firmware_gid == 0x || firmware_gid >= MAX_NUMNODES)
> + firmware_gid = -1;

Same case as above, How do we ensure that we return unique nid for a
similar assoc_array?

> +
> + nid = firmware_group_id_to_nid(firmware_gid);
> 
>   if (nid > 0) {
>   index = lmb->aa_index * aa.array_sz;
> -- 
> 2.26.2
> 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v4 10/10] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-31 Thread Srikar Dronamraju
* Michael Ellerman  [2020-07-31 18:02:21]:

> Srikar Dronamraju  writes:
> > Lookup the coregroup id from the associativity array.
> 

Thanks Michael for all your comments and inputs.

> It's slightly strange that this is called in patch 9, but only properly
> implemented here in patch 10.
> 
> I'm not saying you have to squash them together, but it would be good if
> the change log for patch 9 mentioned that a subsequent commit will
> complete the implementation and how that affects the behaviour.
> 

I probably got influenced by few LKML community members who always add a
stub and implement the gory details in a subsequent patch. I will surely add
the change log in patch 9 about the subsequent patches.

> cheers
> 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v4 08/10] powerpc/smp: Allocate cpumask only after searching thread group

2020-07-31 Thread Srikar Dronamraju
* Michael Ellerman  [2020-07-31 17:52:15]:

> Srikar Dronamraju  writes:
> > If allocated earlier and the search fails, then cpumask need to be
> > freed. However cpu_l1_cache_map can be allocated after we search thread
> > group.
> 
> It's not freed anywhere AFAICS?
> 

Yes, it's never freed. In fact we never check whether
zalloc_cpumask_var_node fails. It's not just this cpumask; historically
all the other existing cpumasks in arch/powerpc/kernel/smp.c are never
freed/checked either. I did dig into this a bit and it appears that ..
(Please do correct me if I am wrong!! )

Powerpc uses cpumask_var_t for all of the percpu variables, and it doesn't
seem to enable CONFIG_CPUMASK_OFFSTACK even with the MAXSMP config.

So from include/linux/cpumask.h

typedef struct cpumask cpumask_var_t[1];
and
zalloc_cpumask_var_node ends up being cpumask_clear

So I think historically we have assumed we are always
!CPUMASK_OFFSTACK and hence we don't need to check the return value or
free the mask.
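
For reference, this is roughly how include/linux/cpumask.h behaves as I read
it (paraphrased sketch, not the exact kernel text):

#ifdef CONFIG_CPUMASK_OFFSTACK
/* The mask is a pointer: zalloc_cpumask_var_node() really allocates
 * NR_CPUS bits, so the return value matters and the mask must be
 * released with free_cpumask_var(). */
typedef struct cpumask *cpumask_var_t;
#else
/* The mask is embedded in the variable itself: zalloc_cpumask_var_node()
 * collapses to cpumask_clear() and always "succeeds", and
 * free_cpumask_var() is a no-op. */
typedef struct cpumask cpumask_var_t[1];
#endif

So with !CPUMASK_OFFSTACK the missing error checks and frees are harmless,
which is why the existing code gets away with it.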

I would look forward to your comments on how we should handle this going
forward. But I would keep this the same for this patchset.

One of the questions that I have is: if we are most likely to always be in
!CONFIG_CPUMASK_OFFSTACK, then should we migrate to cpumask_t for the percpu
variables?
 
The reason being we end up using NR_CPU cpumask for each percpu cpumask
variable instead of using NR_CPU cpumask_t pointer.

> And even after this change there's still an error path that doesn't free
> it, isn't there?
> 
> cheers
> 
> > Cc: linuxppc-dev 
> > Cc: LKML 
> > Cc: Michael Ellerman 
> > Cc: Nicholas Piggin 
> > Cc: Anton Blanchard 
> > Cc: Oliver O'Halloran 
> > Cc: Nathan Lynch 
> > Cc: Michael Neuling 
> > Cc: Gautham R Shenoy 
> > Cc: Ingo Molnar 
> > Cc: Peter Zijlstra 
> > Cc: Valentin Schneider 
> > Cc: Jordan Niethe 
> > Reviewed-by: Gautham R. Shenoy 
> > Signed-off-by: Srikar Dronamraju 
> > ---
> >  arch/powerpc/kernel/smp.c | 7 +++
> >  1 file changed, 3 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > index 698000c7f76f..dab96a1203ec 100644
> > --- a/arch/powerpc/kernel/smp.c
> > +++ b/arch/powerpc/kernel/smp.c
> > @@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
> > if (err)
> > goto out;
> >  
> > -   zalloc_cpumask_var_node(_cpu(cpu_l1_cache_map, cpu),
> > -   GFP_KERNEL,
> > -   cpu_to_node(cpu));
> > -
> > cpu_group_start = get_cpu_thread_group_start(cpu, );
> >  
> > if (unlikely(cpu_group_start == -1)) {
> > @@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
> > goto out;
> > }
> >  
> > +   zalloc_cpumask_var_node(_cpu(cpu_l1_cache_map, cpu),
> > +   GFP_KERNEL, cpu_to_node(cpu));
> > +
> > for (i = first_thread; i < first_thread + threads_per_core; i++) {
> > int i_group_start = get_cpu_thread_group_start(i, );
> >  
> > -- 
> > 2.17.1

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v4 06/10] powerpc/smp: Generalize 2nd sched domain

2020-07-31 Thread Srikar Dronamraju
* Michael Ellerman  [2020-07-31 17:45:37]:

> Srikar Dronamraju  writes:
> > Currently "CACHE" domain happens to be the 2nd sched domain as per
> > powerpc_topology. This domain will collapse if cpumask of l2-cache is
> > same as SMT domain. However we could generalize this domain such that it
> > could mean either be a "CACHE" domain or a "BIGCORE" domain.
> >
> > While setting up the "CACHE" domain, check if shared_cache is already
> > set.
> 
> PeterZ asked for some overview of what you're doing and why, you
> responded to his mail, but I was expecting to see that text incorporated
> here somewhere.
> 

Okay, do you want that as part of the code or documentation dir or the
changelog?

> He also asked for some comments, which I would also like to see.
> 
> 
> I'm also not clear why we want to rename it to "bigcore", that's not a
> commonly understood term, I don't think it's clear to new readers what
> it means.
> 
> Leaving it as the shared cache domain, and having a comment mentioning
> that "bigcores" share a cache, would be clearer I think.
> 

Today, the shared cache is equal to the big core. However, in the not too
distant future, the shared cache domain and the big core may not be the same.
For example, let's assume the L2 cache were to shrink to per small core while
the firmware still exposes the core as a bigcore. Then with the current design
we would have SMT == SHARED CACHE, and a DIE; we would not have any domain at
the publicised 8-thread level. Keeping the bigcore as a domain and mapping the
shared cache onto it (I am resetting the domain name to CACHE if
BIGCORE == SHARED_CACHE) helps us in this scenario.
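
To make that concrete, the shape of the fixup I have in mind is roughly the
below (illustrative sketch only, not the exact code in the series; the
bigcore_idx index and the helper names are assumptions for the example):

static void fixup_topology(void)
{
	if (shared_caches) {
		/* BIGCORE turned out to share the LLC: treat that level as
		 * the cache domain so it carries SD_SHARE_PKG_RESOURCES. */
		pr_info("Using shared cache scheduler topology\n");
		powerpc_topology[bigcore_idx].mask = shared_cache_mask;
		powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
#ifdef CONFIG_SCHED_DEBUG
		powerpc_topology[bigcore_idx].name = "CACHE";
#endif
	}
}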

> cheers
> 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v4 07/10] Powerpc/numa: Detect support for coregroup

2020-07-31 Thread Srikar Dronamraju
* Michael Ellerman  [2020-07-31 17:49:55]:

> Srikar Dronamraju  writes:
> > Add support for grouping cores based on the device-tree classification.
> > - The last domain in the associativity domains always refers to the
> > core.
> > - If primary reference domain happens to be the penultimate domain in
> > the associativity domains device-tree property, then there are no
> > coregroups. However if its not a penultimate domain, then there are
> > coregroups. There can be more than one coregroup. For now we would be
> > interested in the last or the smallest coregroups.
> 
> This still doesn't tell me what a coregroup actually represents.
> 
> I get that it's a grouping of cores, and that the device tree specifies
> it for us, but grouping based on what?
> 

We have just abstracted the fact that we are creating a sub-group of cores
within a DIE. We are limiting to one sub-group per core. However this would
allow the firmware the flexibility to vary the grouping. Once the firmware
starts using this group, we could add more code to detect the type of
grouping and adjust the sd domain flags accordingly.

> I think the answer is we aren't being told by firmware, it's just a
> grouping based on some opaque performance characteristic and we just
> have to take that as given.
> 

This is partially true. At this time, we don't have firmware that can
exploit this code. Once the firmware starts using this grouping, we could
add more code to align the grouping to the scheduler topology.

> But please explain that clearly in the change log and the code comments.
> 

Okay, I will do the needful.

> cheers
> 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v4 00/10] Coregroup support on Powerpc

2020-07-30 Thread Srikar Dronamraju
* Srikar Dronamraju  [2020-07-27 11:02:20]:

> Changelog v3 ->v4:
> v3: 
> https://lore.kernel.org/lkml/20200723085116.4731-1-sri...@linux.vnet.ibm.com/t/#u
>

Here is a summary of some of the testing done with coregroup v4 patchsets.
It includes ebizzy, schbench, perf bench sched pipe and topology verification.
One the left side are results from powerpc/next tree and on the right are the
results with the patchset applied.  Topological verification clearly shows that
there is no change in topology with and without the patches on all the 3 class
of systems that were tested.

On Powerpc/Next                                             On Powerpc/next + Coregroup Support v4 patchset

Power 9 PowerNV (2 Node/ 160 Cpu System)
-
ebizzy (Throughput of 100 iterations of 30 seconds higher throughput is better)
  N  Min   MaxMedian   AvgStddev  N 
 Min   MaxMedian   Avg  Stddev
100   993884   1276090   1173476   1165914 54867.201100   
910470   1279820   1171095   116209167363.28

schbench (latency hence lower is better)
Latency percentiles (usec)  Latency 
percentiles (usec)
50.0th: 455 
50.0th: 454
75.0th: 533 
75.0th: 543
90.0th: 683 
90.0th: 701
95.0th: 743 
95.0th: 737
*99.0th: 815
*99.0th: 805
99.5th: 839 
99.5th: 835
99.9th: 913 
99.9th: 893
min=0, max=1011 
min=0, max=2833

perf bench sched pipe (lower time and higher ops/sec is better)
# Running 'sched/pipe' benchmark: 1000000 pipe operations between two processes
                 powerpc/next    with patchset
Total time:       6.083 [sec]      6.303 [sec]
usecs/op:            6.083576         6.303318
ops/sec:               164377           158646


Power 9 LPAR (2 Node/ 128 Cpu System)
-
ebizzy (Throughput of 100 iterations of 30 seconds; higher throughput is better)
                    N       Min       Max    Median         Avg      Stddev
powerpc/next      100   1058029   1295393   1200414   1188306.7   56786.538
with patchset     100    943264   1287619   1180522   1168473.2   64469.955

schbench (latency in usec; lower is better)
                 powerpc/next    with patchset
  50.0000th:               34               39
  75.0000th:               46               52
  90.0000th:               53               68
  95.0000th:               56               77
 *99.0000th:               61               89
  99.5000th:               63               94
  99.9000th:               81              169
  min/max:               0/8405         0/23674

perf bench sched pipe (lower time and higher ops/sec is better)
# Running 'sched/pipe' benchmark: 1000000 pipe operations between two processes
                 powerpc/next    with patchset
Total time:       8.768 [sec]      5.217 [sec]
usecs/op:            8.768400         5.217625
ops/sec:               114045           191658

Power 8 LPAR (8 Node/ 256 Cpu System)
-
ebizzy (Throughput of 100 iterations of 30 seconds; higher throughput is better)
                    N       Min       Max    Median         Avg      Stddev
powerpc/next      100   1267615   1965234   1707423   1689137.6   144363.29
with patchset     100   1175357   1924262   1691104   1664792.1    145876.4

schbench

Re: [PATCH v2] selftests: powerpc: Fix online CPU selection

2020-07-29 Thread Srikar Dronamraju
* Sandipan Das  [2020-06-09 13:07:33]:

> The size of the CPU affinity mask must be large enough for
> systems with a very large number of CPUs. Otherwise, tests
> which try to determine the first online CPU by calling
> sched_getaffinity() will fail. This makes sure that the size
> of the allocated affinity mask is dependent on the number of
> CPUs as reported by get_nprocs().
> 
> Fixes: 3752e453f6ba ("selftests/powerpc: Add tests of PMU EBBs")
> Reported-by: Shirisha Ganta 
> Signed-off-by: Sandipan Das 
> Reviewed-by: Kamalesh Babulal 
> ---
> Previous versions can be found at:
> v1: 
> https://lore.kernel.org/linuxppc-dev/20200608144212.985144-1-sandi...@linux.ibm.com/
> 
> @@ -88,28 +89,40 @@ void *get_auxv_entry(int type)
> 
>  int pick_online_cpu(void)
>  {
> - cpu_set_t mask;
> - int cpu;
> + int ncpus, cpu = -1;
> + cpu_set_t *mask;
> + size_t size;
> +
> + ncpus = get_nprocs();

Please use get_nprocs_conf or sysconf(_SC_NPROCESSORS_CONF). The manpage
seems to suggest the latter. Not sure how accurate the manpage is.

get_nprocs() returns the number of online cpus, and when SMT is off the
online cpu numbers are sparse, so its result isn't ideal for sizing the
cpumask. get_nprocs_conf() instead returns the maximum number of configured
cpus and therefore handles the sparse-numbering case.

I think this was the same situation hit by Michael Ellerman.

> + size = CPU_ALLOC_SIZE(ncpus);
> + mask = CPU_ALLOC(ncpus);
> + if (!mask) {
> + perror("malloc");
> + return -1;
> + }
> 
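
To make this concrete, here is a minimal, self-contained sketch (not the
submitted patch) of how an online CPU can be picked with a mask sized from
get_nprocs_conf(); the helper name pick_online_cpu() mirrors the selftest,
the body is only an assumption of the approach being discussed:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/sysinfo.h>

/* Sketch only: size the affinity mask by the number of *configured*
 * CPUs so sparse online numbering (e.g. SMT off) still fits. */
int pick_online_cpu(void)
{
	int ncpus = get_nprocs_conf();
	size_t size = CPU_ALLOC_SIZE(ncpus);
	cpu_set_t *mask = CPU_ALLOC(ncpus);
	int cpu = -1;

	if (!mask) {
		perror("CPU_ALLOC");
		return -1;
	}

	CPU_ZERO_S(size, mask);
	if (sched_getaffinity(0, size, mask) == 0) {
		for (int i = 0; i < ncpus; i++) {
			if (CPU_ISSET_S(i, size, mask)) {
				cpu = i;	/* first CPU we may run on */
				break;
			}
		}
	} else {
		perror("sched_getaffinity");
	}

	CPU_FREE(mask);
	return cpu;
}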

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain

2020-07-29 Thread Srikar Dronamraju
* Valentin Schneider  [2020-07-28 16:03:11]:

Hi Valentin,

Thanks for looking into the patches.

> On 27/07/20 06:32, Srikar Dronamraju wrote:
> > Add percpu coregroup maps and masks to create coregroup domain.
> > If a coregroup doesn't exist, the coregroup domain will be degenerated
> > in favour of SMT/CACHE domain.
> >
> 
> So there's at least one arm64 platform out there with the same "pairs of
> cores share L2" thing (Ampere eMAG), and that lives quite happily with the
> default scheduler topology (SMT/MC/DIE). Each pair of core gets its MC
> domain, and the whole system is covered by DIE.
> 
> Now arguably it's not a perfect representation; DIE doesn't have
> SD_SHARE_PKG_RESOURCES so the highest level sd_llc can point to is MC. That
> will impact all callsites using cpus_share_cache(): in the eMAG case, only
> pairs of cores will be seen as sharing cache, even though *all* cores share
> the same L3.
> 

Okay, it's good to know that there is a chip with a topology similar to P9.

> I'm trying to paint a picture of what the P9 topology looks like (the one
> you showcase in your cover letter) to see if there are any similarities;
> from what I gather in [1], wikichips and your cover letter, with P9 you can
> have something like this in a single DIE (somewhat unsure about L3 setup;
> it looks to be distributed?)
> 
>  +-+
>  |  L3 |
>  +---+-+---+-+---+-+---+
>  |   L2  | |   L2  | |   L2  | |   L2  |
>  +--+-+--+ +--+-+--+ +--+-+--+ +--+-+--+
>  |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  |
>  +--+ +--+ +--+ +--+ +--+ +--+ +--+ +--+
>  |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs|
>  +--+ +--+ +--+ +--+ +--+ +--+ +--+ +--+
> 
> Which would lead to (ignoring the whole SMT CPU numbering shenanigans)
> 
> NUMA [   ...
> DIE  [ ]
> MC   [ ] [ ] [ ] [ ]
> BIGCORE  [ ] [ ] [ ] [ ]
> SMT  [   ] [   ] [   ] [   ] [   ] [   ] [   ] [   ]
>  00-03 04-07 08-11 12-15 16-19 20-23 24-27 28-31  
> 

What you have summed up is exactly what a P9 topology looks like. I don't
think I could have explained it better than this.

> This however has MC == BIGCORE; what makes it you can have different spans
> for these two domains? If it's not too much to ask, I'd love to have a P9
> topology diagram.
> 
> [1]: 20200722081822.gg9...@linux.vnet.ibm.com

At this time the current topology is good enough, i.e. BIGCORE would always
be equal to MC. However, in future we could have chips with fewer or more
CPUs in the LLC than in a BIGCORE, or with granular/split L3 caches within a
DIE. In such cases BIGCORE != MC.

Also, in the current P9 itself, two neighbouring core-pairs form a quad.
Cache latency within a quad is better than the latency to a distant core-pair,
and cache latency within a core-pair is far better than the latency within a
quad. So if we have only 4 threads running on a DIE, all of them accessing the
same cache lines, then we could probably benefit if all the tasks were to run
within the quad, aka MC/Coregroup.

I have found some latency-sensitive benchmarks that benefit from grouping at
the quad level (using kernel hacks, not backed by firmware changes). Gautham
found similar results in his experiments, though he only used CPU binding on
a stock kernel.

I am not setting SD_SHARE_PKG_RESOURCES in the MC/Coregroup sd_flags because
the MC domain need not be the LLC domain on Power.
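
For reference, a rough sketch of the topology table on a shared-cache system
with coregroups, mirroring the v4 patchset after its fixups (the point here
is only that the MC entry carries no flags callback):

/*
 * Sketch, mirroring the v4 patchset: CACHE carries
 * SD_SHARE_PKG_RESOURCES, while the MC/coregroup level deliberately
 * has no sd_flags callback, so it is not treated as an LLC domain.
 */
static struct sched_domain_topology_level powerpc_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
#endif
	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
	{ cpu_mc_mask, SD_INIT_NAME(MC) },	/* no SD_SHARE_PKG_RESOURCES */
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};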

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v3 09/10] powerpc/smp: Create coregroup domain

2020-07-27 Thread Srikar Dronamraju
* Gautham R Shenoy  [2020-07-27 10:09:41]:

> > 
> >  static void fixup_topology(void)
> >  {
> > +   if (!has_coregroup_support())
> > +   powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
> > +
> > if (shared_caches) {
> > pr_info("Using shared cache scheduler topology\n");
> > powerpc_topology[bigcore_idx].mask = shared_cache_mask;
> 
> 
> Suppose we consider a topology which does not have coregroup_support,
> but has shared_caches. In that case, we would want our coregroup
> domain to degenerate.
> 
> From the above code, after the fixup, our topology will look as
> follows:
> 
> static struct sched_domain_topology_level powerpc_topology[] = {
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
>   { cpu_bigcore_mask, SD_INIT_NAME(MC) },
>   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>   { NULL, },
> 
> So, in this case, the core-group domain (identified by MC) will
> degenerate only if cpu_bigcore_mask() and shared_cache_mask() return
> the same value. This may work for existing platforms, because either
> shared_caches don't exist, or when they do, cpu_bigcore_mask and
> shared_cache_mask return the same set of CPUs. But this may or may not
> continue to hold good in the future.
> 
> Furthermore, if that is always going to be the case that in the
> presence of shared_caches the cpu_bigcore_mask() and
> shared_cache_mask() will always be the same, then why even define two
> separate masks and not just have only the cpu_bigcore_mask() ?
> 

Your two statements contradict each other: in the former you say we should be
future-proof, while in the latter you ask why add two masks if they are always
going to be the same.

> The correct way would be to set the powerpc_topology[mc_idx].mask to
> powerpc_topology[bigcore_idx].mask *after* we have fixedup the
> big_core level.

The reason I modified it in v4 is not for degeneration or for a future case
but for the current PowerNV/SMT4 case. I could just as well have detected the
same condition and modified bigcore, but I thought doing the fixup in one
place would be better.
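
For clarity, this is roughly the ordering being suggested above (an
illustrative sketch only, using the v4 index names; not what was posted):

static void fixup_topology(void)
{
	/* Fix up the bigcore level first ... */
	if (shared_caches) {
		powerpc_topology[bigcore_idx].mask = shared_cache_mask;
		powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
	}

	/*
	 * ... then let a coregroup-less MC inherit whatever mask bigcore
	 * ended up with, so the MC domain always degenerates against it.
	 */
	if (!has_coregroup_support())
		powerpc_topology[mc_idx].mask = powerpc_topology[bigcore_idx].mask;
}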

-- 
Thanks and Regards
Srikar Dronamraju


[PATCH 7/7] powerpc/smp: Depend on cpu_l1_cache_map when adding cpus

2020-07-27 Thread Srikar Dronamraju
Currently on hotplug/hotunplug, the cpu iterates through all the cpus in
its core to find the threads in its thread group. However, this information
is already captured in cpu_l1_cache_map. Hence we can reduce the iteration
and clean up the add_cpu_to_smallcore_masks() function.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index eceb7aa0f4b8..22f4b3856470 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1276,16 +1276,15 @@ static void remove_cpu_from_masks(int cpu)
 
 static inline void add_cpu_to_smallcore_masks(int cpu)
 {
-   struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu);
-   int i, first_thread = cpu_first_thread_sibling(cpu);
+   int i;
 
if (!has_big_cores)
return;
 
cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu));
 
-   for (i = first_thread; i < first_thread + threads_per_core; i++) {
-   if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map))
+   for_each_cpu(i, per_cpu(cpu_l1_cache_map, cpu)) {
+   if (cpu_online(i))
set_cpus_related(i, cpu, cpu_smallcore_mask);
}
 }
-- 
2.17.1



[PATCH 5/7] powerpc/smp: Limit cpus traversed to within a node.

2020-07-27 Thread Srikar Dronamraju
All the arch-specific topology cpumasks are within a node/die. However, when
setting these per-cpu cpumasks, the system traverses all the online cpus.
This is redundant.

Reduce the traversal to only the online cpus in the node to which the cpu
belongs.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index cde157483abf..9b03aad0beac 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1232,7 +1232,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
return false;
 
cpumask_set_cpu(cpu, mask_fn(cpu));
-   for_each_cpu(i, cpu_online_mask) {
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
/*
 * when updating the marks the current CPU has not been marked
 * online, but we need to update the cache masks
-- 
2.17.1



[PATCH 6/7] powerpc/smp: Stop passing mask to update_mask_by_l2

2020-07-27 Thread Srikar Dronamraju
update_mask_by_l2() has only one call site, and that call site always passes
cpu_l2_cache_mask as the parameter. Instead of passing cpu_l2_cache_mask,
use it directly in update_mask_by_l2().

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9b03aad0beac..eceb7aa0f4b8 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1222,7 +1222,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
 }
 
-static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
+static bool update_mask_by_l2(int cpu)
 {
struct device_node *l2_cache, *np;
int i;
@@ -1231,7 +1231,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
if (!l2_cache)
return false;
 
-   cpumask_set_cpu(cpu, mask_fn(cpu));
+   cpumask_set_cpu(cpu, cpu_l2_cache_mask(cpu));
for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1242,7 +1242,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
continue;
 
if (np == l2_cache)
-   set_cpus_related(cpu, i, mask_fn);
+   set_cpus_related(cpu, i, cpu_l2_cache_mask);
 
of_node_put(np);
}
@@ -1306,7 +1306,7 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   update_mask_by_l2(cpu, cpu_l2_cache_mask);
+   update_mask_by_l2(cpu);
 
if (has_coregroup_support()) {
int coregroup_id = cpu_to_coregroup_id(cpu);
-- 
2.17.1



[PATCH 4/7] powerpc/smp: Optimize remove_cpu_from_masks

2020-07-27 Thread Srikar Dronamraju
Currently while offlining a cpu, we iterate through all the cpus in the
DIE to clear the sibling, l2_cache and smallcore maps. However, if there are
many cores in a DIE, we end up spending time iterating through cpus which are
completely unrelated.

Optimize this by iterating only through a smaller but relevant cpumask:
if shared_caches is set, cpu_l2_cache_map is the relevant mask, otherwise
cpu_sibling_map is.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d476098fc25c..cde157483abf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1254,14 +1254,21 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
 #ifdef CONFIG_HOTPLUG_CPU
 static void remove_cpu_from_masks(int cpu)
 {
+   struct cpumask *(*mask_fn)(int) = cpu_sibling_mask;
int i;
 
-   for_each_cpu(i, cpu_cpu_mask(cpu)) {
+   if (shared_caches)
+   mask_fn = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask_fn(cpu)) {
set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
-   if (has_coregroup_support())
+   }
+
+   if (has_coregroup_support()) {
+   for_each_cpu(i, cpu_coregroup_mask(cpu))
set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
}
 }
-- 
2.17.1



[PATCH 3/7] powerpc/smp: Remove get_physical_package_id

2020-07-27 Thread Srikar Dronamraju
Now that cpu_core_mask has been removed and topology_core_cpumask has
been updated to use cpu_cpu_mask, we no longer need
get_physical_package_id.

Please note get_physical_package_id is an exported symbol. However, it was
introduced recently and probably has no users outside the kernel.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/include/asm/topology.h |  5 -
 arch/powerpc/kernel/smp.c   | 20 
 2 files changed, 25 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index e0f232533c9d..e45219f74be0 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -114,12 +114,7 @@ static inline int cpu_to_coregroup_id(int cpu)
 #ifdef CONFIG_PPC64
 #include 
 
-#ifdef CONFIG_PPC_SPLPAR
-int get_physical_package_id(int cpu);
-#define topology_physical_package_id(cpu)  (get_physical_package_id(cpu))
-#else
 #define topology_physical_package_id(cpu)  (cpu_to_chip_id(cpu))
-#endif
 
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu))
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c28e1b4957b..d476098fc25c 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1283,26 +1283,6 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
}
 }
 
-int get_physical_package_id(int cpu)
-{
-   int pkg_id = cpu_to_chip_id(cpu);
-
-   /*
-* If the platform is PowerNV or Guest on KVM, ibm,chip-id is
-* defined. Hence we would return the chip-id as the result of
-* get_physical_package_id.
-*/
-   if (pkg_id == -1 && firmware_has_feature(FW_FEATURE_LPAR) &&
-   IS_ENABLED(CONFIG_PPC_SPLPAR)) {
-   struct device_node *np = of_get_cpu_node(cpu, NULL);
-   pkg_id = of_node_to_nid(np);
-   of_node_put(np);
-   }
-
-   return pkg_id;
-}
-EXPORT_SYMBOL_GPL(get_physical_package_id);
-
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
-- 
2.17.1



[PATCH 2/7] powerpc/smp: Stop updating cpu_core_mask

2020-07-27 Thread Srikar Dronamraju
Anton Blanchard reported that his 4096 vcpu KVM guest took around 30
minutes to boot. He traced it to the time taken to iterate while
setting the cpu_core_mask.

Further analysis shows that cpu_core_mask and cpu_cpu_mask for any CPU
would be equal on Power. However, updating cpu_core_mask took forever
because it is a per-cpu cpumask variable, whereas cpu_cpu_mask is a per
NODE/per DIE cpumask shared by all the respective CPUs.

Also, cpu_cpu_mask is needed from a scheduler perspective. However,
cpu_core_map is an exported symbol. Hence stop updating cpu_core_map
and make it point to cpu_cpu_mask.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/include/asm/smp.h |  5 -
 arch/powerpc/kernel/smp.c  | 33 +++--
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 5bdc17a7049f..cf6e7c7be62b 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -119,11 +119,6 @@ static inline struct cpumask *cpu_sibling_mask(int cpu)
return per_cpu(cpu_sibling_map, cpu);
 }
 
-static inline struct cpumask *cpu_core_mask(int cpu)
-{
-   return per_cpu(cpu_core_map, cpu);
-}
-
 static inline struct cpumask *cpu_l2_cache_mask(int cpu)
 {
return per_cpu(cpu_l2_cache_map, cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 95f0bf72e283..8c28e1b4957b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -957,12 +957,17 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
local_memory_node(numa_cpu_lookup_table[cpu]));
}
 #endif
+   /*
+* cpu_core_map is no more updated and exists only since
+* its been exported for long. It only will have a snapshot
+* of cpu_cpu_mask.
+*/
+   cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
}
 
/* Init the cpumasks so the boot CPU is related to itself */
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
-   cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
if (has_coregroup_support())
cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
@@ -1251,9 +1256,7 @@ static void remove_cpu_from_masks(int cpu)
 {
int i;
 
-   /* NB: cpu_core_mask is a superset of the others */
-   for_each_cpu(i, cpu_core_mask(cpu)) {
-   set_cpus_unrelated(cpu, i, cpu_core_mask);
+   for_each_cpu(i, cpu_cpu_mask(cpu)) {
set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
@@ -1303,7 +1306,6 @@ EXPORT_SYMBOL_GPL(get_physical_package_id);
 static void add_cpu_to_masks(int cpu)
 {
int first_thread = cpu_first_thread_sibling(cpu);
-   int pkg_id = get_physical_package_id(cpu);
int i;
 
/*
@@ -1311,7 +1313,6 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
@@ -1333,26 +1334,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_coregroup_mask);
}
}
-
-   if (pkg_id == -1) {
-   struct cpumask *(*mask)(int) = cpu_sibling_mask;
-
-   /*
-* Copy the sibling mask into core sibling mask and
-* mark any CPUs on the same chip as this CPU.
-*/
-   if (shared_caches)
-   mask = cpu_l2_cache_mask;
-
-   for_each_cpu(i, mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
-
-   return;
-   }
-
-   for_each_cpu(i, cpu_online_mask)
-   if (get_physical_package_id(i) == pkg_id)
-   set_cpus_related(cpu, i, cpu_core_mask);
 }
 
 /* Activate a secondary processor. */
-- 
2.17.1



[PATCH 1/7] powerpc/topology: Update topology_core_cpumask

2020-07-27 Thread Srikar Dronamraju
On Power, cpu_core_mask and cpu_cpu_mask refer to the same set of CPUs.
cpu_cpu_mask is needed by the scheduler, hence look at deprecating
cpu_core_mask. Before deleting cpu_core_mask, ensure its only user is
moved to cpu_cpu_mask.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/include/asm/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index 6609174918ab..e0f232533c9d 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -122,7 +122,7 @@ int get_physical_package_id(int cpu);
 #endif
 
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
-#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
+#define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu))
 #define topology_core_id(cpu)  (cpu_to_core_id(cpu))
 
 #endif
-- 
2.17.1



[PATCH 0/7] Optimization to improve cpu online/offline on Powerpc

2020-07-27 Thread Srikar Dronamraju
Anton reported that his 4096 CPU guest (1024 cores in a socket) was taking
too long to boot. He also found that most of the time was being spent
updating cpu_core_mask.

Here are some optimizations and fixes to make ppc64_cpu --smt=8/ppc64_cpu
--smt=1 run faster and hence boot the kernel also faster.

Its based on top of my v4 coregroup support patchset.
http://lore.kernel.org/lkml/20200727053230.19753-1-sri...@linux.vnet.ibm.com/t/#u

The first two patches should solve Anton's immediate problem.
On the unofficial patches, Anton reported that the boot time came down from
30 minutes to 6 seconds (basically a high core count in a single socket
configuration). Satheesh also reported similar numbers.

The rest are simple cleanups/optimizations.

Since cpu_core_mask has been an exported symbol for a long time, let's
retain it as a snapshot of cpumask_of_node.

Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  160
On-line CPU(s) list: 0-159
Thread(s) per core:  4
Core(s) per socket:  20
Socket(s):   2
NUMA node(s):2
Model:   2.2 (pvr 004e 1202)
Model name:  POWER9, altivec supported
CPU max MHz: 3800.
CPU min MHz: 2166.
L1d cache:   32K
L1i cache:   32K
L2 cache:512K
L3 cache:10240K
NUMA node0 CPU(s):   0-79
NUMA node8 CPU(s):   80-159

without patch (powerpc/next)
[0.099347] smp: Bringing up secondary CPUs ...
[0.832513] smp: Brought up 2 nodes, 160 CPUs

with powerpc/next + coregroup support patchset
[0.099241] smp: Bringing up secondary CPUs ...
[0.835627] smp: Brought up 2 nodes, 160 CPUs

with powerpc/next + coregroup + this patchset
[0.097232] smp: Bringing up secondary CPUs ...
[0.528457] smp: Brought up 2 nodes, 160 CPUs

x ppc64_cpu --smt=1
+ ppc64_cpu --smt=4

without patch
    N    Min    Max    Median      Avg        Stddev
x 100  11.82  17.06     14.01    14.05     1.2665247
+ 100  12.25  16.59     13.86  14.1143      1.164293

with patch
    N    Min    Max    Median      Avg        Stddev
x 100  12.68  16.15     14.24   14.238    0.75489246
+ 100  12.93  15.85     14.35  14.2897    0.60041813

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Satheesh Rajendran 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 

Srikar Dronamraju (7):
  powerpc/topology: Update topology_core_cpumask
  powerpc/smp: Stop updating cpu_core_mask
  powerpc/smp: Remove get_physical_package_id
  powerpc/smp: Optimize remove_cpu_from_masks
  powerpc/smp: Limit cpus traversed to within a node.
  powerpc/smp: Stop passing mask to update_mask_by_l2
  powerpc/smp: Depend on cpu_l1_cache_map when adding cpus

 arch/powerpc/include/asm/smp.h  |  5 --
 arch/powerpc/include/asm/topology.h |  7 +--
 arch/powerpc/kernel/smp.c   | 79 +
 3 files changed, 24 insertions(+), 67 deletions(-)

-- 
2.17.1



Re: [PATCH v4 00/10] Coregroup support on Powerpc

2020-07-27 Thread Srikar Dronamraju
* Srikar Dronamraju  [2020-07-27 10:47:55]:


> Changelog v3 ->v4:
> v3: 
> https://lore.kernel.org/lkml/20200723085116.4731-1-sri...@linux.vnet.ibm.com/t/#u
> 
> powerpc/smp: Create coregroup domain
>   if coregroup_support doesn't exist, update MC mask to the next
>   smaller domain mask.
> 

Sorry for the double post of v4.
Please follow the other thread.

http://lore.kernel.org/lkml/20200727053230.19753-1-sri...@linux.vnet.ibm.com/t/#u

> 

-- 
Thanks and Regards
Srikar Dronamraju


[PATCH v4 10/10] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-27 Thread Srikar Dronamraju
Look up the coregroup id from the associativity array.

If unable to detect the coregroup id, fall back on the core id. This way
the sched_domain degenerates and an extra sched domain is not created.

Ideally this function should have been implemented in
arch/powerpc/kernel/smp.c. However, if it is implemented in mm/numa.c, we
don't need to find the primary domain again.

If the device-tree mentions more than one coregroup, then the kernel
implements only the last, i.e. the smallest, coregroup, which currently
corresponds to the penultimate domain in the device-tree.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by : Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Move coregroup_enabled before getting associativity (Gautham)

 arch/powerpc/mm/numa.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0d57779e7942..8b3b3ec7fcc4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1218,6 +1218,26 @@ int find_and_online_cpu_nid(int cpu)
 
 int cpu_to_coregroup_id(int cpu)
 {
+   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   int index;
+
+   if (cpu < 0 || cpu > nr_cpu_ids)
+   return -1;
+
+   if (!coregroup_enabled)
+   goto out;
+
+   if (!firmware_has_feature(FW_FEATURE_VPHN))
+   goto out;
+
+   if (vphn_get_associativity(cpu, associativity))
+   goto out;
+
+   index = of_read_number(associativity, 1);
+   if (index > min_common_depth + 1)
+   return of_read_number(&associativity[index - 1], 1);
+
+out:
return cpu_to_core_id(cpu);
 }
 
-- 
2.17.1



[PATCH v4 09/10] Powerpc/smp: Create coregroup domain

2020-07-26 Thread Srikar Dronamraju
Add percpu coregroup maps and masks to create the coregroup domain.
If a coregroup doesn't exist, the coregroup domain will degenerate
in favour of the SMT/CACHE domain.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v3 ->v4:
if coregroup_support doesn't exist, update MC mask to the next
smaller domain mask.

Changelog v2 -> v3:
Optimize mask updates when coregroup_support is present

Changelog v1 -> v2:
Moved coregroup topology fixup to fixup_topology (Gautham)

 arch/powerpc/include/asm/topology.h | 10 +++
 arch/powerpc/kernel/smp.c   | 44 +
 arch/powerpc/mm/numa.c  |  5 
 3 files changed, 59 insertions(+)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f0b6300e7dd3..6609174918ab 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 
*cpu2_assoc)
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int find_and_online_cpu_nid(int cpu);
+extern int cpu_to_coregroup_id(int cpu);
 #else
 static inline int find_and_online_cpu_nid(int cpu)
 {
return 0;
 }
 
+static inline int cpu_to_coregroup_id(int cpu)
+{
+#ifdef CONFIG_SMP
+   return cpu_to_core_id(cpu);
+#else
+   return 0;
+#endif
+}
+
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index dab96a1203ec..95f0bf72e283 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
 
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
@@ -91,6 +92,7 @@ enum {
smt_idx,
 #endif
bigcore_idx,
+   mc_idx,
die_idx,
 };
 
@@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static struct cpumask *cpu_coregroup_mask(int cpu)
+{
+   return per_cpu(cpu_coregroup_map, cpu);
+}
+
+static bool has_coregroup_support(void)
+{
+   return coregroup_enabled;
+}
+
+static const struct cpumask *cpu_mc_mask(int cpu)
+{
+   return cpu_coregroup_mask(cpu);
+}
+
 static const struct cpumask *cpu_bigcore_mask(int cpu)
 {
return per_cpu(cpu_sibling_map, cpu);
@@ -879,6 +896,7 @@ static struct sched_domain_topology_level 
powerpc_topology[] = {
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
+   { cpu_mc_mask, SD_INIT_NAME(MC) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -925,6 +943,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+   if (has_coregroup_support())
+   zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
@@ -942,6 +964,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
+   if (has_coregroup_support())
+   cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
+
init_big_cores();
if (has_big_cores) {
cpumask_set_cpu(boot_cpuid,
@@ -1233,6 +1258,8 @@ static void remove_cpu_from_masks(int cpu)
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
+   if (has_coregroup_support())
+   set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
}
 }
 #endif
@@ -1293,6 +1320,20 @@ static void add_cpu_to_masks(int cpu)
add_cpu_to_smallcore_masks(cpu);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
+   if (has_coregroup_support()) {
+   int coregroup_id = cpu_to_coregroup_id(cpu);
+
+   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+   int fcpu = c

[PATCH v4 08/10] powerpc/smp: Allocate cpumask only after searching thread group

2020-07-26 Thread Srikar Dronamraju
If it were allocated earlier and the search failed, the cpumask would need
to be freed. However, cpu_l1_cache_map can simply be allocated after we
search the thread group.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 698000c7f76f..dab96a1203ec 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
if (err)
goto out;
 
-   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
-   GFP_KERNEL,
-   cpu_to_node(cpu));
-
cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
 
if (unlikely(cpu_group_start == -1)) {
@@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
goto out;
}
 
+   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
for (i = first_thread; i < first_thread + threads_per_core; i++) {
int i_group_start = get_cpu_thread_group_start(i, &tg);
 
-- 
2.17.1



[PATCH v4 07/10] Powerpc/numa: Detect support for coregroup

2020-07-26 Thread Srikar Dronamraju
Add support for grouping cores based on the device-tree classification.
- The last domain in the associativity domains always refers to the
core.
- If the primary reference domain happens to be the penultimate domain in
the associativity domains device-tree property, then there are no
coregroups. However, if it is not the penultimate domain, then there are
coregroups. There can be more than one coregroup; for now we are only
interested in the last, i.e. the smallest, coregroup.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Explained Coregroup in commit msg (Michael Ellerman)

 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c  |  1 +
 arch/powerpc/mm/numa.c | 34 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 49a25e2400f2..5bdc17a7049f 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -28,6 +28,7 @@
 extern int boot_cpuid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
+extern bool coregroup_enabled;
 
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3c5ccf6d2b1c..698000c7f76f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 struct task_struct *secondary_current;
 bool has_big_cores;
+bool coregroup_enabled;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 2298899a0f0a..51cb672f113b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -886,7 +886,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, 
u64 end_pfn)
 static void __init find_possible_nodes(void)
 {
struct device_node *rtas;
-   u32 numnodes, i;
+   const __be32 *domains;
+   int prop_length, max_nodes;
+   u32 i;
 
if (!numa_enabled)
return;
@@ -895,25 +897,31 @@ static void __init find_possible_nodes(void)
if (!rtas)
return;
 
-   if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains",
-   min_common_depth, &numnodes)) {
-   /*
-* ibm,current-associativity-domains is a fairly recent
-* property. If it doesn't exist, then fallback on
-* ibm,max-associativity-domains. Current denotes what the
-* platform can support compared to max which denotes what the
-* Hypervisor can support.
-*/
-   if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains",
-   min_common_depth, &numnodes))
+   /*
+* ibm,current-associativity-domains is a fairly recent property. If
+* it doesn't exist, then fallback on ibm,max-associativity-domains.
+* Current denotes what the platform can support compared to max
+* which denotes what the Hypervisor can support.
+*/
+   domains = of_get_property(rtas, "ibm,current-associativity-domains",
+   &prop_length);
+   if (!domains) {
+   domains = of_get_property(rtas, "ibm,max-associativity-domains",
+   &prop_length);
+   if (!domains)
goto out;
}
 
-   for (i = 0; i < numnodes; i++) {
+   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
+   prop_length /= sizeof(int);
+   if (prop_length > min_common_depth + 2)
+   coregroup_enabled = 1;
+
 out:
of_node_put(rtas);
 }
-- 
2.17.1



[PATCH v4 06/10] powerpc/smp: Generalize 2nd sched domain

2020-07-26 Thread Srikar Dronamraju
Currently "CACHE" domain happens to be the 2nd sched domain as per
powerpc_topology. This domain will collapse if cpumask of l2-cache is
same as SMT domain. However we could generalize this domain such that it
could mean either be a "CACHE" domain or a "BIGCORE" domain.

While setting up the "CACHE" domain, check if shared_cache is already
set.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Moved shared_cache topology fixup to fixup_topology (Gautham)

 arch/powerpc/kernel/smp.c | 48 +++
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d997c7411664..3c5ccf6d2b1c 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
+enum {
+#ifdef CONFIG_SCHED_SMT
+   smt_idx,
+#endif
+   bigcore_idx,
+   die_idx,
+};
+
 #define MAX_THREAD_LIST_SIZE   8
 #define THREAD_GROUP_SHARE_L1   1
 struct thread_groups {
@@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-   if (shared_caches)
-   return cpu_l2_cache_mask(cpu);
-
-   if (has_big_cores)
-   return cpu_smallcore_mask(cpu);
-
-   return per_cpu(cpu_sibling_map, cpu);
+   return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static const struct cpumask *cpu_bigcore_mask(int cpu)
+{
+   return per_cpu(cpu_sibling_map, cpu);
+}
+
 static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -1311,7 +1318,6 @@ static void add_cpu_to_masks(int cpu)
 void start_secondary(void *unused)
 {
unsigned int cpu = smp_processor_id();
-   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
 
mmgrab(_mm);
current->active_mm = _mm;
@@ -1337,14 +1343,20 @@ void start_secondary(void *unused)
/* Update topology CPU masks */
add_cpu_to_masks(cpu);
 
-   if (has_big_cores)
-   sibling_mask = cpu_smallcore_mask;
/*
 * Check for any shared caches. Note that this must be done on a
 * per-core basis because one core in the pair might be disabled.
 */
-   if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
-   shared_caches = true;
+   if (!shared_caches) {
+   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+   struct cpumask *mask = cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   sibling_mask = cpu_smallcore_mask;
+
+   if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+   shared_caches = true;
+   }
 
set_numa_node(numa_cpu_lookup_table[cpu]);
set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
@@ -1375,9 +1387,17 @@ static void fixup_topology(void)
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
-   powerpc_topology[0].mask = smallcore_smt_mask;
+   powerpc_topology[smt_idx].mask = smallcore_smt_mask;
}
 #endif
+   if (shared_caches) {
+   pr_info("Using shared cache scheduler topology\n");
+   powerpc_topology[bigcore_idx].mask = shared_cache_mask;
+   powerpc_topology[bigcore_idx].sd_flags = 
powerpc_shared_cache_flags;
+#ifdef CONFIG_SCHED_DEBUG
+   powerpc_topology[bigcore_idx].name = "CACHE";
+#endif
+   }
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
-- 
2.17.1



[PATCH v4 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-26 Thread Srikar Dronamraju
Current code assumes that the cpumask of cpus sharing an l2-cache will
always be a superset of cpu_sibling_mask.

Let's stop making that assumption. cpu_l2_cache_mask is a superset of
cpu_sibling_mask if and only if shared_caches is set.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Set cpumask after verifying l2-cache. (Gautham)

 arch/powerpc/kernel/smp.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index da27f6909be1..d997c7411664 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1194,6 +1194,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
*(*mask_fn)(int))
if (!l2_cache)
return false;
 
+   cpumask_set_cpu(cpu, mask_fn(cpu));
for_each_cpu(i, cpu_online_mask) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1276,29 +1277,30 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   /*
-* Copy the thread sibling mask into the cache sibling mask
-* and mark any CPUs that share an L2 with this CPU.
-*/
-   for_each_cpu(i, cpu_sibling_mask(cpu))
-   set_cpus_related(cpu, i, cpu_l2_cache_mask);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-   /*
-* Copy the cache sibling mask into core sibling mask and mark
-* any CPUs on the same chip as this CPU.
-*/
-   for_each_cpu(i, cpu_l2_cache_mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
+   if (pkg_id == -1) {
+   struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+   /*
+* Copy the sibling mask into core sibling mask and
+* mark any CPUs on the same chip as this CPU.
+*/
+   if (shared_caches)
+   mask = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask(cpu))
+   set_cpus_related(cpu, i, cpu_core_mask);
 
-   if (pkg_id == -1)
return;
+   }
 
for_each_cpu(i, cpu_online_mask)
if (get_physical_package_id(i) == pkg_id)
-- 
2.17.1



[PATCH v4 04/10] powerpc/smp: Move topology fixups into a new function

2020-07-26 Thread Srikar Dronamraju
Move topology fixup based on the platform attributes into its own
function which is called just before set_sched_topology.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2 -> v3:
Rewrote changelog (Gautham)
Renamed to powerpc/smp: Move topology fixups into  a new function

 arch/powerpc/kernel/smp.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index a685915e5941..da27f6909be1 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1368,6 +1368,16 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
+static void fixup_topology(void)
+{
+#ifdef CONFIG_SCHED_SMT
+   if (has_big_cores) {
+   pr_info("Big cores detected but using small core scheduling\n");
+   powerpc_topology[0].mask = smallcore_smt_mask;
+   }
+#endif
+}
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
@@ -1381,12 +1391,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
dump_numa_cpu_topology();
 
-#ifdef CONFIG_SCHED_SMT
-   if (has_big_cores) {
-   pr_info("Big cores detected but using small core scheduling\n");
-   powerpc_topology[0].mask = smallcore_smt_mask;
-   }
-#endif
+   fixup_topology();
set_sched_topology(powerpc_topology);
 }
 
-- 
2.17.1



[PATCH v4 03/10] powerpc/smp: Move powerpc_topology above

2020-07-26 Thread Srikar Dronamraju
Just move the powerpc_topology description earlier in the file.
This will help in using functions defined in this file without forward
declarations.

No other functional changes.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 116 +++---
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 283a04e54f52..a685915e5941 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
return err;
 }
 
+static bool shared_caches;
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static int powerpc_smt_flags(void)
+{
+   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+   flags |= SD_ASYM_PACKING;
+   }
+   return flags;
+}
+#endif
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+   return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+   if (shared_caches)
+   return cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   return cpu_smallcore_mask(cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *smallcore_smt_mask(int cpu)
+{
+   return cpu_smallcore_mask(cpu);
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { NULL, },
+};
+
 static int init_big_cores(void)
 {
int cpu;
@@ -1247,8 +1305,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_core_mask);
 }
 
-static bool shared_caches;
-
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
@@ -1312,62 +1368,6 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
-#ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymmetric SMT dependency */
-static int powerpc_smt_flags(void)
-{
-   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-
-   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-   flags |= SD_ASYM_PACKING;
-   }
-   return flags;
-}
-#endif
-
-/*
- * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
- * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
- * since the migrated task remains cache hot. We want to take advantage of this
- * at the scheduler level so an extra topology level is required.
- */
-static int powerpc_shared_cache_flags(void)
-{
-   return SD_SHARE_PKG_RESOURCES;
-}
-
-/*
- * We can't just pass cpu_l2_cache_mask() directly because
- * returns a non-const pointer and the compiler barfs on that.
- */
-static const struct cpumask *shared_cache_mask(int cpu)
-{
-   if (shared_caches)
-   return cpu_l2_cache_mask(cpu);
-
-   if (has_big_cores)
-   return cpu_smallcore_mask(cpu);
-
-   return per_cpu(cpu_sibling_map, cpu);
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
-{
-   return cpu_smallcore_mask(cpu);
-}
-#endif
-
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
-- 
2.17.1



[PATCH v4 02/10] powerpc/smp: Merge Power9 topology with Power topology

2020-07-26 Thread Srikar Dronamraju
A new sched_domain_topology_level was added just for Power9. However, the
same can be achieved by merging power9_topology into powerpc_topology,
which also makes the code simpler, especially when adding a new sched
domain.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
since cpu_smt_mask is only defined under CONFIG_SCHED_SMT

 arch/powerpc/kernel/smp.c | 33 ++---
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index edf94ca64eea..283a04e54f52 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1313,7 +1313,7 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymetric SMT dependancy */
+/* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
@@ -1326,14 +1326,6 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
@@ -1351,7 +1343,13 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-   return cpu_l2_cache_mask(cpu);
+   if (shared_caches)
+   return cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   return cpu_smallcore_mask(cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -1361,7 +1359,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
-static struct sched_domain_topology_level power9_topology[] = {
+static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
@@ -1386,21 +1384,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
-   power9_topology[0].mask = smallcore_smt_mask;
powerpc_topology[0].mask = smallcore_smt_mask;
}
 #endif
-   /*
-* If any CPU detects that it's sharing a cache with another CPU then
-* use the deeper topology that is aware of this sharing.
-*/
-   if (shared_caches) {
-   pr_info("Using shared cache scheduler topology\n");
-   set_sched_topology(power9_topology);
-   } else {
-   pr_info("Using standard scheduler topology\n");
-   set_sched_topology(powerpc_topology);
-   }
+   set_sched_topology(powerpc_topology);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
2.17.1



[PATCH v4 00/10] Coregroup support on Powerpc

2020-07-26 Thread Srikar Dronamraju
0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

On Power 9 (with device-tree enablement to show coregroups).
(hunks for mimicing a coregroup was posted at
https://lore.kernel.org/linuxppc-dev/20200714043624.5648-1-sri...@linux.vnet.ibm.com/t/#m2cb09bb11c7a93257d6123d1d27edb8212f8af21)
---
$ tail /proc/cpuinfo
processor   : 127
cpu : POWER9 (architected), altivec supported
clock   : 3000.00MHz
revision: 2.2 (pvr 004e 0202)

timebase: 51200
platform: pSeries
model   : IBM,9008-22L
machine : CHRP IBM,9008-22L
MMU : Hash

Before patchset
--
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 ,,,0055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 ,,,00aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After patchset
--
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
MC
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 ,,,0055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain4 ,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 ,,,00aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 

Srikar Dronamraju (10):
  powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES
  powerpc/smp: Merge Power9 topology with Power topology
  powerpc/smp: Move powerpc_topology above
  powerpc/smp: Move topology fixups into  a new function
  powerpc/smp: Dont assume l2-cache to be superset of sibling
  powerpc/smp: Generalize 2nd sched domain
  powerpc/numa: Detect support for coregroup
  powerpc/smp: Allocate cpumask only after searching thread group
  powerpc/smp: Create coregroup domain
  powerpc/smp: Implement cpu_to_coregroup_id

 arch/powerpc/include/asm/smp.h  |   1 +
 arch/powerpc/include/asm/topology.h |  10 ++
 arch/powerpc/kernel/smp.c   | 246 +---
 arch/powerpc/mm/numa.c  |  59 +--
 4 files changed, 210 insertions(+), 106 deletions(-)

-- 
2.17.1



[PATCH v4 01/10] powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES

2020-07-26 Thread Srikar Dronamraju
Fix a build warning seen in a non-CONFIG_NEED_MULTIPLE_NODES build:
"error: 'numa_cpu_lookup_table' undeclared"

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2 -> v3:
Removed node caching part. Rewrote the Commit msg (Michael Ellerman)
Renamed to powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES

 arch/powerpc/kernel/smp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..edf94ca64eea 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -860,6 +860,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+#ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
 */
@@ -868,6 +869,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
set_cpu_numa_mem(cpu,
local_memory_node(numa_cpu_lookup_table[cpu]));
}
+#endif
}
 
/* Init the cpumasks so the boot CPU is related to itself */
-- 
2.17.1



[PATCH v4 03/10] powerpc/smp: Move powerpc_topology above

2020-07-26 Thread Srikar Dronamraju
Just move the powerpc_topology description higher up in the file.
This lets other code in this file use these functions without needing
forward declarations.

No other functional changes.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 116 +++---
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 283a04e54f52..a685915e5941 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
return err;
 }
 
+static bool shared_caches;
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static int powerpc_smt_flags(void)
+{
+   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+   flags |= SD_ASYM_PACKING;
+   }
+   return flags;
+}
+#endif
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+   return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+   if (shared_caches)
+   return cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   return cpu_smallcore_mask(cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *smallcore_smt_mask(int cpu)
+{
+   return cpu_smallcore_mask(cpu);
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { NULL, },
+};
+
 static int init_big_cores(void)
 {
int cpu;
@@ -1247,8 +1305,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_core_mask);
 }
 
-static bool shared_caches;
-
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
@@ -1312,62 +1368,6 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
-#ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymmetric SMT dependency */
-static int powerpc_smt_flags(void)
-{
-   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-
-   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-   flags |= SD_ASYM_PACKING;
-   }
-   return flags;
-}
-#endif
-
-/*
- * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
- * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
- * since the migrated task remains cache hot. We want to take advantage of this
- * at the scheduler level so an extra topology level is required.
- */
-static int powerpc_shared_cache_flags(void)
-{
-   return SD_SHARE_PKG_RESOURCES;
-}
-
-/*
- * We can't just pass cpu_l2_cache_mask() directly because
- * returns a non-const pointer and the compiler barfs on that.
- */
-static const struct cpumask *shared_cache_mask(int cpu)
-{
-   if (shared_caches)
-   return cpu_l2_cache_mask(cpu);
-
-   if (has_big_cores)
-   return cpu_smallcore_mask(cpu);
-
-   return per_cpu(cpu_sibling_map, cpu);
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
-{
-   return cpu_smallcore_mask(cpu);
-}
-#endif
-
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
-- 
2.17.1



[PATCH v4 10/10] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-26 Thread Srikar Dronamraju
Look up the coregroup id from the associativity array.

If the coregroup id cannot be detected, fall back on the core id. This
way the coregroup sched_domain degenerates and no extra sched domain is
created.

Ideally this function would be implemented in arch/powerpc/kernel/smp.c.
However, implementing it in mm/numa.c means we don't need to find the
primary domain again.

If the device-tree mentions more than one coregroup, then the kernel
implements only the last, i.e. the smallest, coregroup, which currently
corresponds to the penultimate domain in the device-tree.
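
To make the index arithmetic concrete, a stand-alone sketch with
made-up values (the real lookup is in the hunk below); the first word
of the VPHN associativity buffer is the number of entries that follow,
and the last entry identifies the core:

#include <stdio.h>

int main(void)
{
        /* Hypothetical associativity buffer: word 0 is the entry count,
         * the last entry identifies the core. All values are made up. */
        unsigned int assoc[] = { 5, 0, 2, 7, 33, 1260 };
        int min_common_depth = 2;       /* assumed primary reference domain */
        int index = assoc[0];

        if (index > min_common_depth + 1)
                printf("coregroup id = %u\n", assoc[index - 1]); /* prints 33 */
        else
                printf("no coregroup, fall back to the core id\n");
        return 0;
}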

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Move coregroup_enabled before getting associativity (Gautham)

 arch/powerpc/mm/numa.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0d57779e7942..8b3b3ec7fcc4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1218,6 +1218,26 @@ int find_and_online_cpu_nid(int cpu)
 
 int cpu_to_coregroup_id(int cpu)
 {
+   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   int index;
+
+   if (cpu < 0 || cpu > nr_cpu_ids)
+   return -1;
+
+   if (!coregroup_enabled)
+   goto out;
+
+   if (!firmware_has_feature(FW_FEATURE_VPHN))
+   goto out;
+
+   if (vphn_get_associativity(cpu, associativity))
+   goto out;
+
+   index = of_read_number(associativity, 1);
+   if (index > min_common_depth + 1)
+   return of_read_number(&associativity[index - 1], 1);
+
+out:
return cpu_to_core_id(cpu);
 }
 
-- 
2.17.1



[PATCH v4 09/10] Powerpc/smp: Create coregroup domain

2020-07-26 Thread Srikar Dronamraju
Add percpu coregroup maps and masks to create the coregroup domain.
If a coregroup doesn't exist, the coregroup domain will degenerate
in favour of the SMT/CACHE domain.
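
To illustrate what degeneration means here: a level whose span matches
the level below it for every CPU carries no extra information, so the
scheduler collapses it. A toy user-space sketch of that redundancy
check (not the scheduler's actual sd_degenerate() logic; the spans are
made up):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical per-CPU spans for two adjacent levels on a toy 8-CPU
 * box; bit i set means CPU i is grouped with that CPU at this level. */
static const unsigned char cache_span[NR_CPUS] = {
        0x0f, 0x0f, 0x0f, 0x0f, 0xf0, 0xf0, 0xf0, 0xf0,
};
static const unsigned char mc_span[NR_CPUS] = {
        0x0f, 0x0f, 0x0f, 0x0f, 0xf0, 0xf0, 0xf0, 0xf0,
};

int main(void)
{
        bool redundant = true;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (mc_span[cpu] != cache_span[cpu])
                        redundant = false;

        printf("MC level %s\n", redundant ? "would degenerate" : "is kept");
        return 0;
}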

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v3 -> v4:
If coregroup support doesn't exist, update the MC mask to the next
smaller domain mask.

Changelog v2 -> v3:
Add optimization for mask updates under coregroup support

Changelog v1 -> v2:
Moved coregroup topology fixup to fixup_topology (Gautham)

 arch/powerpc/include/asm/topology.h | 10 ++
 arch/powerpc/kernel/smp.c   | 48 +
 arch/powerpc/mm/numa.c  |  5 +++
 3 files changed, 63 insertions(+)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f0b6300e7dd3..6609174918ab 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 
*cpu2_assoc)
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int find_and_online_cpu_nid(int cpu);
+extern int cpu_to_coregroup_id(int cpu);
 #else
 static inline int find_and_online_cpu_nid(int cpu)
 {
return 0;
 }
 
+static inline int cpu_to_coregroup_id(int cpu)
+{
+#ifdef CONFIG_SMP
+   return cpu_to_core_id(cpu);
+#else
+   return 0;
+#endif
+}
+
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index dab96a1203ec..95f0bf72e283 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
 
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
@@ -91,6 +92,7 @@ enum {
smt_idx,
 #endif
bigcore_idx,
+   mc_idx,
die_idx,
 };
 
@@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static struct cpumask *cpu_coregroup_mask(int cpu)
+{
+   return per_cpu(cpu_coregroup_map, cpu);
+}
+
+static bool has_coregroup_support(void)
+{
+   return coregroup_enabled;
+}
+
+static const struct cpumask *cpu_mc_mask(int cpu)
+{
+   return cpu_coregroup_mask(cpu);
+}
+
 static const struct cpumask *cpu_bigcore_mask(int cpu)
 {
return per_cpu(cpu_sibling_map, cpu);
@@ -879,6 +896,7 @@ static struct sched_domain_topology_level 
powerpc_topology[] = {
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
+   { cpu_mc_mask, SD_INIT_NAME(MC) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -925,6 +943,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+   if (has_coregroup_support())
+   zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
@@ -942,6 +964,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
+   if (has_coregroup_support())
+   cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
+
init_big_cores();
if (has_big_cores) {
cpumask_set_cpu(boot_cpuid,
@@ -1233,6 +1258,8 @@ static void remove_cpu_from_masks(int cpu)
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
+   if (has_coregroup_support())
+   set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
}
 }
 #endif
@@ -1293,6 +1320,20 @@ static void add_cpu_to_masks(int cpu)
add_cpu_to_smallcore_masks(cpu);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
+   if (has_coregroup_support()) {
+   int coregroup_id = cpu_to_coregroup_id(cpu);
+
+   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+   int fcpu = c

[PATCH v4 08/10] powerpc/smp: Allocate cpumask only after searching thread group

2020-07-26 Thread Srikar Dronamraju
If the cpumask is allocated before the search and the search fails, it
needs to be freed. However, cpu_l1_cache_map can simply be allocated
after we have searched the thread group.
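
The shape of the change, as a stand-alone sketch of the
allocate-after-validation pattern (this is not the kernel code;
lookup_group_start() is a made-up stand-in for the thread-group
search):

#include <stdlib.h>

struct mask { unsigned long bits[4]; };

/* Hypothetical stand-in for the thread-group search. */
static int lookup_group_start(int cpu)
{
        return cpu >= 0 ? cpu : -1;
}

/* Validate first, allocate only on the success path, so the error
 * path never has anything to free. */
static int setup_mask(int cpu, struct mask **out)
{
        if (lookup_group_start(cpu) < 0)
                return -1;              /* nothing allocated yet */

        *out = calloc(1, sizeof(**out));
        return *out ? 0 : -1;
}

int main(void)
{
        struct mask *m = NULL;
        int rc = setup_mask(0, &m);

        free(m);
        return rc;
}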

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 698000c7f76f..dab96a1203ec 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
if (err)
goto out;
 
-   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
-   GFP_KERNEL,
-   cpu_to_node(cpu));
-
cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
 
if (unlikely(cpu_group_start == -1)) {
@@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
goto out;
}
 
+   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
for (i = first_thread; i < first_thread + threads_per_core; i++) {
int i_group_start = get_cpu_thread_group_start(i, &tg);
 
-- 
2.17.1



[PATCH v4 07/10] Powerpc/numa: Detect support for coregroup

2020-07-26 Thread Srikar Dronamraju
Add support for grouping cores based on the device-tree classification.
- The last domain in the associativity domains always refers to the
core.
- If the primary reference domain happens to be the penultimate domain
in the associativity-domains device-tree property, then there are no
coregroups. However, if it is not the penultimate domain, then there
are coregroups. There can be more than one coregroup; for now we are
only interested in the last, i.e. the smallest, coregroup (see the
sketch below).
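
A stand-alone sketch of the detection rule with made-up values (the
real check against ibm,current-associativity-domains is in the hunk
below):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical associativity-domains property: the entry at
         * min_common_depth is the node count, the last entry is the
         * core. All values are made up. */
        unsigned int domains[] = { 5, 1, 4, 2, 8, 128 };
        int prop_length = sizeof(domains) / sizeof(domains[0]);
        int min_common_depth = 2;       /* assumed primary reference domain */
        bool coregroup_enabled;

        printf("possible nodes : %u\n", domains[min_common_depth]);

        /* Entries between the primary domain and the core are coregroups. */
        coregroup_enabled = prop_length > min_common_depth + 2;
        printf("coregroups     : %s\n", coregroup_enabled ? "present" : "absent");
        return 0;
}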

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Explained Coregroup in commit msg (Michael Ellerman)

 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c  |  1 +
 arch/powerpc/mm/numa.c | 34 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 49a25e2400f2..5bdc17a7049f 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -28,6 +28,7 @@
 extern int boot_cpuid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
+extern bool coregroup_enabled;
 
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3c5ccf6d2b1c..698000c7f76f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 struct task_struct *secondary_current;
 bool has_big_cores;
+bool coregroup_enabled;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 2298899a0f0a..51cb672f113b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -886,7 +886,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, 
u64 end_pfn)
 static void __init find_possible_nodes(void)
 {
struct device_node *rtas;
-   u32 numnodes, i;
+   const __be32 *domains;
+   int prop_length, max_nodes;
+   u32 i;
 
if (!numa_enabled)
return;
@@ -895,25 +897,31 @@ static void __init find_possible_nodes(void)
if (!rtas)
return;
 
-   if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains",
-   min_common_depth, &numnodes)) {
-   /*
-* ibm,current-associativity-domains is a fairly recent
-* property. If it doesn't exist, then fallback on
-* ibm,max-associativity-domains. Current denotes what the
-* platform can support compared to max which denotes what the
-* Hypervisor can support.
-*/
-   if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains",
-   min_common_depth, &numnodes))
+   /*
+* ibm,current-associativity-domains is a fairly recent property. If
+* it doesn't exist, then fallback on ibm,max-associativity-domains.
+* Current denotes what the platform can support compared to max
+* which denotes what the Hypervisor can support.
+*/
+   domains = of_get_property(rtas, "ibm,current-associativity-domains",
+   &prop_length);
+   if (!domains) {
+   domains = of_get_property(rtas, "ibm,max-associativity-domains",
+   &prop_length);
+   if (!domains)
goto out;
}
 
-   for (i = 0; i < numnodes; i++) {
+   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
+   prop_length /= sizeof(int);
+   if (prop_length > min_common_depth + 2)
+   coregroup_enabled = 1;
+
 out:
of_node_put(rtas);
 }
-- 
2.17.1


