Re: [RFC][PATCH] sched: Rename DIE domain

2023-07-16 Thread Gautham R. Shenoy
On Wed, Jul 12, 2023 at 04:10:56PM +0200, Peter Zijlstra wrote:
> Hi
> 
> Thomas just tripped over the x86 topology setup creating a 'DIE' domain
> for the package mask :-)
> 
> Since these names are SCHED_DEBUG only, rename them.
> I don't think anybody *should* be relying on this, but who knows.
> 
> Signed-off-by: Peter Zijlstra (Intel) 

This naming makes sense.
Acked-by: Gautham R. Shenoy 

--
Thanks and Regards
gautham.


Re: [PATCH 14/36] cpuidle: Fix rcu_idle_*() usage

2022-07-26 Thread Gautham R. Shenoy
On Wed, Jun 08, 2022 at 04:27:37PM +0200, Peter Zijlstra wrote:
> The whole disable-RCU, enable-IRQS dance is very intricate since
> changing IRQ state is traced, which depends on RCU.
> 
> Add two helpers for the cpuidle case that mirror the entry code.
> 
> Signed-off-by: Peter Zijlstra (Intel) 

Reviewed-by: Gautham R. Shenoy 
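
The two helpers give the drivers below a single, correctly ordered way
to stop instrumentation before RCU stops watching, and the mirror image
on exit. A minimal sketch of the shape they take (the authoritative
definitions are in the patch's include/linux/cpuidle.h hunk, which is
truncated in this digest; the real helpers also handle the lockdep and
IRQ-tracing dance):

	static __always_inline void cpuidle_rcu_enter(void)
	{
		/* Instrumentation relies on RCU, so it must stop first. */
		instrumentation_end();
		rcu_idle_enter();
	}

	static __always_inline void cpuidle_rcu_exit(void)
	{
		/* Mirror image: RCU watches again before instrumentation resumes. */
		rcu_idle_exit();
		instrumentation_begin();
	}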

> ---
>  arch/arm/mach-imx/cpuidle-imx6q.c    |  4 +--
>  arch/arm/mach-imx/cpuidle-imx6sx.c   |  4 +--
>  arch/arm/mach-omap2/cpuidle34xx.c    |  4 +--
>  arch/arm/mach-omap2/cpuidle44xx.c    |  8 +++---
>  drivers/acpi/processor_idle.c        | 18 --
>  drivers/cpuidle/cpuidle-big_little.c |  4 +--
>  drivers/cpuidle/cpuidle-mvebu-v7.c   |  4 +--
>  drivers/cpuidle/cpuidle-psci.c       |  4 +--
>  drivers/cpuidle/cpuidle-riscv-sbi.c  |  4 +--
>  drivers/cpuidle/cpuidle-tegra.c      |  8 +++---
>  drivers/cpuidle/cpuidle.c            | 11 
>  include/linux/cpuidle.h              | 37 +---
>  kernel/sched/idle.c                  | 45 ++-
>  kernel/time/tick-broadcast.c         |  6 +++-
>  14 files changed, 90 insertions(+), 71 deletions(-)
> 
> --- a/arch/arm/mach-imx/cpuidle-imx6q.c
> +++ b/arch/arm/mach-imx/cpuidle-imx6q.c
> @@ -24,9 +24,9 @@ static int imx6q_enter_wait(struct cpuid
>   imx6_set_lpm(WAIT_UNCLOCKED);
>   raw_spin_unlock(&cpuidle_lock);
>  
> - rcu_idle_enter();
> + cpuidle_rcu_enter();
>   cpu_do_idle();
> - rcu_idle_exit();
> + cpuidle_rcu_exit();
>  
>   raw_spin_lock(&cpuidle_lock);
>   if (num_idle_cpus-- == num_online_cpus())
> --- a/arch/arm/mach-imx/cpuidle-imx6sx.c
> +++ b/arch/arm/mach-imx/cpuidle-imx6sx.c
> @@ -47,9 +47,9 @@ static int imx6sx_enter_wait(struct cpui
>   cpu_pm_enter();
>   cpu_cluster_pm_enter();
>  
> - rcu_idle_enter();
> + cpuidle_rcu_enter();
>   cpu_suspend(0, imx6sx_idle_finish);
> - rcu_idle_exit();
> + cpuidle_rcu_exit();
>  
>   cpu_cluster_pm_exit();
>   cpu_pm_exit();
> --- a/arch/arm/mach-omap2/cpuidle34xx.c
> +++ b/arch/arm/mach-omap2/cpuidle34xx.c
> @@ -133,9 +133,9 @@ static int omap3_enter_idle(struct cpuid
>   }
>  
>   /* Execute ARM wfi */
> - rcu_idle_enter();
> + cpuidle_rcu_enter();
>   omap_sram_idle();
> - rcu_idle_exit();
> + cpuidle_rcu_exit();
>  
>   /*
>* Call idle CPU PM enter notifier chain to restore
> --- a/arch/arm/mach-omap2/cpuidle44xx.c
> +++ b/arch/arm/mach-omap2/cpuidle44xx.c
> @@ -105,9 +105,9 @@ static int omap_enter_idle_smp(struct cp
>   }
>   raw_spin_unlock_irqrestore(&mpu_lock, flag);
>  
> - rcu_idle_enter();
> + cpuidle_rcu_enter();
>   omap4_enter_lowpower(dev->cpu, cx->cpu_state);
> - rcu_idle_exit();
> + cpuidle_rcu_exit();
>  
>   raw_spin_lock_irqsave(&mpu_lock, flag);
>   if (cx->mpu_state_vote == num_online_cpus())
> @@ -186,10 +186,10 @@ static int omap_enter_idle_coupled(struc
>   }
>   }
>  
> - rcu_idle_enter();
> + cpuidle_rcu_enter();
>   omap4_enter_lowpower(dev->cpu, cx->cpu_state);
>   cpu_done[dev->cpu] = true;
> - rcu_idle_exit();
> + cpuidle_rcu_exit();
>  
>   /* Wakeup CPU1 only if it is not offlined */
>   if (dev->cpu == 0 && cpumask_test_cpu(1, cpu_online_mask)) {
> --- a/drivers/acpi/processor_idle.c
> +++ b/drivers/acpi/processor_idle.c
> @@ -607,7 +607,7 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
>   * @cx: Target state context
>   * @index: index of target state
>   */
> -static int acpi_idle_enter_bm(struct cpuidle_driver *drv,
> +static noinstr int acpi_idle_enter_bm(struct cpuidle_driver *drv,
>  struct acpi_processor *pr,
>  struct acpi_processor_cx *cx,
>  int index)
> @@ -626,6 +626,8 @@ static int acpi_idle_enter_bm(struct cpu
>*/
>   bool dis_bm = pr->flags.bm_control;
>  
> + instrumentation_begin();
> +
>   /* If we can skip BM, demote to a safe state. */
>   if (!cx->bm_sts_skip && acpi_idle_bm_check()) {
>   dis_bm = false;
> @@ -647,11 +649,11 @@ static int acpi_idle_enter_bm(struct cpu
>   raw_spin_unlock(&c3_lock);
>   }
>  
> - rcu_idle_enter();
> + cpuidle_rcu_enter();
>  
>   acpi_idle_do_entry(cx);
>  
> - rcu_idle_exit();
> + cpuidle_rcu_exit();
>  
>   /* Re-enable bus master arbitration */
>   if (dis_bm) {
> @@

Re: [PATCH v2 1/3] powerpc/smp: Fix a crash while booting kvm guest with nr_cpus=2

2021-08-31 Thread Gautham R Shenoy
On Thu, Aug 26, 2021 at 03:33:59PM +0530, Srikar Dronamraju wrote:
> Aneesh reported a crash with a fairly recent upstream kernel when
> booting a kernel whose command line was appended with nr_cpus=2
> 
> 1:mon> e
> cpu 0x1: Vector: 300 (Data Access) at [c8a67bd0]
> pc: c002557c: cpu_to_chip_id+0x3c/0x100
> lr: c0058380: start_secondary+0x460/0xb00
> sp: c8a67e70
>msr: 80001033
>dar: 10
>  dsisr: 8
>   current = 0xc891bb00
>   paca= 0xc018ff981f80   irqmask: 0x03   irq_happened: 0x01
> pid   = 0, comm = swapper/1
> Linux version 5.13.0-rc3-15704-ga050a6d2b7e8 (kvaneesh@ltc-boston8) (gcc 
> (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) 
> #433 SMP Tue May 25 02:38:49 CDT 2021
> 1:mon> t
> [link register   ] c0058380 start_secondary+0x460/0xb00
> [c8a67e70] c8a67eb0 (unreliable)
> [c8a67eb0] c00589d4 start_secondary+0xab4/0xb00
> [c8a67f90] c000c654 start_secondary_prolog+0x10/0x14
> 
> Current code assumes that num_possible_cpus() is always greater than
> threads_per_core. However this may not be true when using nr_cpus=2 or
> similar options. Handle the case where num_possible_cpus() is not an
> exact multiple of threads_per_core.
> 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Aneesh Kumar K.V 
> Cc: Nathan Lynch 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Gautham R Shenoy 
> Cc: Vincent Guittot 
> Fixes: c1e53367dab1 ("powerpc/smp: Cache CPU to chip lookup")
> Reported-by: Aneesh Kumar K.V 
> Debugged-by: Michael Ellerman 
> Signed-off-by: Srikar Dronamraju 


This looks good to me.
Reviewed-by: Gautham R. Shenoy 

> ---
> Changelog: v1 -> v2:
> v1: - 
> https://lore.kernel.org/linuxppc-dev/20210821092419.167454-2-sri...@linux.vnet.ibm.com/t/#u
> Handled comment from Gautham Shenoy
> [ Updated to use DIV_ROUND_UP instead of max to handle more situations ]
> 
>  arch/powerpc/kernel/smp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 6c6e4d934d86..bf11b3c4eb28 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1074,7 +1074,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   }
> 
>   if (cpu_to_chip_id(boot_cpuid) != -1) {
> - int idx = num_possible_cpus() / threads_per_core;
> + int idx = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
> 
>   /*
>* All threads of a core will all belong to the same core,
> -- 
> 2.18.2
> 


Re: [PATCH 3/3] powerpc/smp: Enable CACHE domain for shared processor

2021-08-24 Thread Gautham R Shenoy
Hello Srikar,

On Sat, Aug 21, 2021 at 02:54:19PM +0530, Srikar Dronamraju wrote:
[..snip..]

The patch looks good to me.

Reviewed-by: Gautham R. Shenoy 

> ---
>  arch/powerpc/kernel/smp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 3d26d3c61e94..47b15f31cc29 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1365,7 +1365,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
>   l2_cache = cpu_to_l2cache(cpu);
>   if (!l2_cache || !*mask) {
>   /* Assume only core siblings share cache with this CPU */
> - for_each_cpu(i, submask_fn(cpu))
> + for_each_cpu(i, cpu_sibling_mask(cpu))
>   set_cpus_related(cpu, i, cpu_l2_cache_mask);
> 
>   return false;
> -- 
> 2.18.2
> 


Re: [PATCH 1/3] powerpc/smp: Fix a crash while booting kvm guest with nr_cpus=2

2021-08-23 Thread Gautham R Shenoy
On Sat, Aug 21, 2021 at 02:54:17PM +0530, Srikar Dronamraju wrote:
> Aneesh reported a crash with a fairly recent upstream kernel when
> booting a kernel whose command line was appended with nr_cpus=2
> 
> 1:mon> e
> cpu 0x1: Vector: 300 (Data Access) at [c8a67bd0]
> pc: c002557c: cpu_to_chip_id+0x3c/0x100
> lr: c0058380: start_secondary+0x460/0xb00
> sp: c8a67e70
>msr: 80001033
>dar: 10
>  dsisr: 8
>   current = 0xc891bb00
>   paca= 0xc018ff981f80   irqmask: 0x03   irq_happened: 0x01
> pid   = 0, comm = swapper/1
> Linux version 5.13.0-rc3-15704-ga050a6d2b7e8 (kvaneesh@ltc-boston8) (gcc 
> (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) 
> #433 SMP Tue May 25 02:38:49 CDT 2021
> 1:mon> t
> [link register   ] c0058380 start_secondary+0x460/0xb00
> [c8a67e70] c8a67eb0 (unreliable)
> [c8a67eb0] c00589d4 start_secondary+0xab4/0xb00
> [c8a67f90] c000c654 start_secondary_prolog+0x10/0x14
> 
> Current code assumes that num_possible_cpus() is always greater than
> threads_per_core. However this may not be true when using nr_cpus=2 or
> similar options. Handle the case where num_possible_cpus() is smaller
> than threads_per_core.
>
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Aneesh Kumar K.V 
> Cc: Nathan Lynch 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Gautham R Shenoy 
> Cc: Vincent Guittot 
> Fixes: c1e53367dab1 ("powerpc/smp: Cache CPU to chip lookup")
> Reported-by: Aneesh Kumar K.V 
> Debugged-by: Michael Ellerman 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/kernel/smp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 6c6e4d934d86..3d6874fe1937 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1074,7 +1074,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   }
> 
>   if (cpu_to_chip_id(boot_cpuid) != -1) {
> - int idx = num_possible_cpus() / threads_per_core;
> + int idx = max((int)num_possible_cpus() / threads_per_core, 1);

I think this code was assuming that num_possible_cpus() is a multiple
of threads_per_core.

So, on a system with threads_per_core=8, if we pass nr_cpus=10, we
will still get idx=1. Thus, we will allocate only one entry in
chip_id_lookup_table[] even though there are two cores and
chip_id_lookup_table[] is expected to have one entry per core.

Is this a valid scenario? If yes, should we use

	idx = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);

?
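
(For reference, DIV_ROUND_UP() in include/linux/kernel.h is essentially
the macro below, so with nr_cpus=10 and threads_per_core=8 it yields
(10 + 7) / 8 = 2 entries, one per core, instead of 1.)

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))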

> 
>   /*
>* All threads of a core will all belong to the same core,
> -- 
> 2.18.2
> 

--
Thanks and Regards
gautham.


Re: [PATCHv2 3/3] powerpc/smp: Use existing L2 cache_map cpumask to find L3 cache siblings

2021-07-30 Thread Gautham R Shenoy
On Wed, Jul 28, 2021 at 11:26:07PM +0530, Parth Shah wrote:
> On POWER10 systems, the "ibm,thread-groups" property "2" indicates the cpus
> in thread-group share both L2 and L3 caches. Hence, use cache_property = 2
> itself to find both the L2 and L3 cache siblings.
> Also, create a new thread_group_l3_cache_map to keep the list of L3
> siblings, but fill the mask using the same property "2" array.

This version looks good to me.

Reviewed-by: Gautham R. Shenoy 
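
For context, the "ibm,thread-groups" device-tree property that this
code parses is an integer array laid out roughly as below (a simplified
sketch based on the parsing code, not a normative description):

	/*
	 * ibm,thread-groups = <property nr_groups threads_per_group
	 *                      thread0 thread1 ... threadN ...>
	 *
	 * property == 1 : the threads in a group share L1 cache resources
	 * property == 2 : the threads in a group share the L2 cache
	 *                 (and, on P10, also the L3)
	 */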

> 
> Signed-off-by: Parth Shah 

> ---
>  arch/powerpc/include/asm/smp.h  |  3 ++
>  arch/powerpc/kernel/cacheinfo.c |  3 ++
>  arch/powerpc/kernel/smp.c   | 66 ++---
>  3 files changed, 51 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 1259040cc3a4..7ef1cd8168a0 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -35,6 +35,7 @@ extern int *chip_id_lookup_table;
> 
>  DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
>  DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
> +DECLARE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map);
> 
>  #ifdef CONFIG_SMP
> 
> @@ -144,6 +145,7 @@ extern int cpu_to_core_id(int cpu);
> 
>  extern bool has_big_cores;
>  extern bool thread_group_shares_l2;
> +extern bool thread_group_shares_l3;
> 
>  #define cpu_smt_mask cpu_smt_mask
>  #ifdef CONFIG_SCHED_SMT
> @@ -198,6 +200,7 @@ extern void __cpu_die(unsigned int cpu);
>  #define hard_smp_processor_id()  get_hard_smp_processor_id(0)
>  #define smp_setup_cpu_maps()
>  #define thread_group_shares_l2  0
> +#define thread_group_shares_l3   0
>  static inline void inhibit_secondary_onlining(void) {}
>  static inline void uninhibit_secondary_onlining(void) {}
>  static inline const struct cpumask *cpu_sibling_mask(int cpu)
> diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
> index 20d91693eac1..cf1be75b7833 100644
> --- a/arch/powerpc/kernel/cacheinfo.c
> +++ b/arch/powerpc/kernel/cacheinfo.c
> @@ -469,6 +469,9 @@ static int get_group_id(unsigned int cpu_id, int level)
>   else if (thread_group_shares_l2 && level == 2)
>   return cpumask_first(per_cpu(thread_group_l2_cache_map,
>cpu_id));
> + else if (thread_group_shares_l3 && level == 3)
> + return cpumask_first(per_cpu(thread_group_l3_cache_map,
> +  cpu_id));
>   return -1;
>  }
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index a7fcac44a8e2..f2abd88e0c25 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -78,6 +78,7 @@ struct task_struct *secondary_current;
>  bool has_big_cores;
>  bool coregroup_enabled;
>  bool thread_group_shares_l2;
> +bool thread_group_shares_l3;
> 
>  DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
> @@ -101,7 +102,7 @@ enum {
> 
>  #define MAX_THREAD_LIST_SIZE 8
>  #define THREAD_GROUP_SHARE_L1   1
> -#define THREAD_GROUP_SHARE_L2   2
> +#define THREAD_GROUP_SHARE_L2_L3 2
>  struct thread_groups {
>   unsigned int property;
>   unsigned int nr_groups;
> @@ -131,6 +132,12 @@ DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
>   */
>  DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
> 
> +/*
> + * On P10, thread_group_l3_cache_map for each CPU is equal to the
> + * thread_group_l2_cache_map
> + */
> +DEFINE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map);
> +
>  /* SMP operations for this machine */
>  struct smp_ops_t *smp_ops;
> 
> @@ -889,19 +896,41 @@ static struct thread_groups *__init get_thread_groups(int cpu,
>   return tg;
>  }
> 
> +static int update_mask_from_threadgroup(cpumask_var_t *mask, struct thread_groups *tg, int cpu, int cpu_group_start)
> +{
> + int first_thread = cpu_first_thread_sibling(cpu);
> + int i;
> +
> + zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
> +
> + for (i = first_thread; i < first_thread + threads_per_core; i++) {
> + int i_group_start = get_cpu_thread_group_start(i, tg);
> +
> + if (unlikely(i_group_start == -1)) {
> + WARN_ON_ONCE(1);
> + return -ENODATA;
> + }
> +
> + if (i_group_start == cpu_group_start)
> + cpumask_set_cpu(i, *mask);
> + }
> +
> + return 0;
> +}
> +
>  static int __init init_thread_group_cache_map(int cpu, int cache_prop

Re: [PATCH] cpufreq:powernv: Fix init_chip_info initialization in numa=off

2021-07-27 Thread Gautham R Shenoy
On Mon, Jul 26, 2021 at 10:37:57PM +0530, Pratik R. Sampat wrote:
> In the numa=off kernel command-line configuration init_chip_info() loops
> around the number of chips and attempts to copy the cpumask of that node
> which is NULL for all iterations after the first chip.
> 
> Hence, store the cpu mask for each chip instead of deriving the cpumask
> from the node while populating the "chips" struct array, and copy that
> to chips[i].mask.
> 
> Cc: sta...@vger.kernel.org
> Fixes: 053819e0bf84 ("cpufreq: powernv: Handle throttling due to Pmax capping at chip level")
> Signed-off-by: Pratik R. Sampat 
> Reported-by: Shirisha Ganta 
> ---
>  drivers/cpufreq/powernv-cpufreq.c | 15 +--
>  1 file changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> index 005600cef273..8ec10d9aed8f 100644
> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -1046,12 +1046,20 @@ static int init_chip_info(void)
>   unsigned int *chip;
>   unsigned int cpu, i;
>   unsigned int prev_chip_id = UINT_MAX;
> + cpumask_t *chip_cpu_mask;
>   int ret = 0;
> 
>   chip = kcalloc(num_possible_cpus(), sizeof(*chip), GFP_KERNEL);
>   if (!chip)
>   return -ENOMEM;
> 
> + /* Allocate a chip cpu mask large enough to fit mask for all chips */
> + chip_cpu_mask = kcalloc(32, sizeof(cpumask_t), GFP_KERNEL);

I suppose by 32 you mean the maximum number of chips possible. You
could use a #define for that.
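
Something along these lines, say (the macro name is hypothetical; the
driver would pick its own):

	/* Hypothetical name: an upper bound on the number of chips. */
	#define MAX_NR_CHIPS	32

	chip_cpu_mask = kcalloc(MAX_NR_CHIPS, sizeof(cpumask_t), GFP_KERNEL);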

Otherwise, the patch looks good to me.

Reviewed-by: Gautham R. Shenoy 



> + if (!chip_cpu_mask) {
> + ret = -ENOMEM;
> + goto free_and_return;
> + }
> +
>   for_each_possible_cpu(cpu) {
>   unsigned int id = cpu_to_chip_id(cpu);
> 
> @@ -1059,22 +1067,25 @@ static int init_chip_info(void)
>   prev_chip_id = id;
>   chip[nr_chips++] = id;
>   }
> + cpumask_set_cpu(cpu, &chip_cpu_mask[nr_chips-1]);
>   }
> 
>   chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
>   if (!chips) {
>   ret = -ENOMEM;
> - goto free_and_return;
> + goto out_chip_cpu_mask;
>   }
> 
>   for (i = 0; i < nr_chips; i++) {
>   chips[i].id = chip[i];
> - cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
> + cpumask_copy(&chips[i].mask, &chip_cpu_mask[i]);
>   INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
>   for_each_cpu(cpu, &chips[i].mask)
>   per_cpu(chip_info, cpu) = &chips[i];
>   }
> 
> +out_chip_cpu_mask:
> + kfree(chip_cpu_mask);
>  free_and_return:
>   kfree(chip);
>   return ret;
> -- 
> 2.31.1
> 


Re: [PATCH 3/3] powerpc/smp: Use existing L2 cache_map cpumask to find L3 cache siblings

2021-07-20 Thread Gautham R Shenoy
Hi Parth,

Sorry for the late review.

On Tue, Jun 15, 2021 at 12:38:04PM +0530, Parth Shah wrote:
> On POWER10 systems, the "ibm,thread-groups" property "2" indicates the cpus
> in thread-group share both L2 and L3 caches. Hence, use cache_property = 2
> itself to find both the L2 and L3 cache siblings.
> Also, rename the existing macros to detect whether the cache property
> is for L2 or L3, and use the L2 cache map itself to find the presence
> of L3 siblings.
> 
> Signed-off-by: Parth Shah 
> ---
>  arch/powerpc/include/asm/smp.h  |  2 ++
>  arch/powerpc/kernel/cacheinfo.c |  3 +++
>  arch/powerpc/kernel/smp.c   | 20 +++-
>  3 files changed, 20 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 1259040cc3a4..55082d343bd2 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -144,6 +144,7 @@ extern int cpu_to_core_id(int cpu);
> 
>  extern bool has_big_cores;
>  extern bool thread_group_shares_l2;
> +extern bool thread_group_shares_l3;
> 
>  #define cpu_smt_mask cpu_smt_mask
>  #ifdef CONFIG_SCHED_SMT
> @@ -198,6 +199,7 @@ extern void __cpu_die(unsigned int cpu);
>  #define hard_smp_processor_id()  get_hard_smp_processor_id(0)
>  #define smp_setup_cpu_maps()
>  #define thread_group_shares_l2  0
> +#define thread_group_shares_l3   0
>  static inline void inhibit_secondary_onlining(void) {}
>  static inline void uninhibit_secondary_onlining(void) {}
>  static inline const struct cpumask *cpu_sibling_mask(int cpu)
> diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
> index 20d91693eac1..378ae20d05a9 100644
> --- a/arch/powerpc/kernel/cacheinfo.c
> +++ b/arch/powerpc/kernel/cacheinfo.c
> @@ -469,6 +469,9 @@ static int get_group_id(unsigned int cpu_id, int level)
>   else if (thread_group_shares_l2 && level == 2)
>   return cpumask_first(per_cpu(thread_group_l2_cache_map,
>cpu_id));
> + else if (thread_group_shares_l3 && level == 3)
> + return cpumask_first(per_cpu(thread_group_l2_cache_map,
> +  cpu_id));

We should either rename thread_group_l2_cache_map as
thread_group_l2_l3_cache_map, or we should create a separate
thread_group_l3_cache_map. I prefer the latter approach, since it makes
the code consistent.
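
The latter would mirror the existing L2 map, e.g. (a sketch along the
lines of what the v2 of this series ends up doing):

	DEFINE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map);

	/* ... and in get_group_id(): */
	else if (thread_group_shares_l3 && level == 3)
		return cpumask_first(per_cpu(thread_group_l3_cache_map,
					     cpu_id));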

Otherwise, the patch looks good to me.

--
Thanks and Regards
gautham.

>   return -1;
>  }
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index a34877257f2d..d0c70fcd0068 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -78,6 +78,7 @@ struct task_struct *secondary_current;
>  bool has_big_cores;
>  bool coregroup_enabled;
>  bool thread_group_shares_l2;
> +bool thread_group_shares_l3;
> 
>  DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
> @@ -101,7 +102,7 @@ enum {
> 
>  #define MAX_THREAD_LIST_SIZE 8
>  #define THREAD_GROUP_SHARE_L1   1
> -#define THREAD_GROUP_SHARE_L2   2
> +#define THREAD_GROUP_SHARE_L2_L3 2
>  struct thread_groups {
>   unsigned int property;
>   unsigned int nr_groups;
> @@ -887,9 +888,16 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
>   cpumask_var_t *mask = NULL;
> 
>   if (cache_property != THREAD_GROUP_SHARE_L1 &&
> - cache_property != THREAD_GROUP_SHARE_L2)
> + cache_property != THREAD_GROUP_SHARE_L2_L3)
>   return -EINVAL;
> 
> + /*
> +  * On P10 fused-core system, the L3 cache is shared between threads of a
> +  * small core only, but the "ibm,thread-groups" property is indicated as
> +  * "2" only which is interpreted as the thread-groups sharing both L2
> +  * and L3 caches. Hence cache_property of THREAD_GROUP_SHARE_L2_L3 is
> +  * used for both L2 and L3 cache sibling detection.
> +  */
>   tg = get_thread_groups(cpu, cache_property, );
>   if (!tg)
>   return err;
> @@ -903,7 +911,7 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
> 
>   if (cache_property == THREAD_GROUP_SHARE_L1)
>   mask = &per_cpu(thread_group_l1_cache_map, cpu);
> - else if (cache_property == THREAD_GROUP_SHARE_L2)
> + else if (cache_property == THREAD_GROUP_SHARE_L2_L3)
>   mask = &per_cpu(thread_group_l2_cache_map, cpu);
> 
>   zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
> @@ -1009,14 +1017,16 @@ static int __init init_big_cores(void)
>   has_big_cores = true;
> 
>   for_each_possible_cpu(cpu) {
> - int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2);
> + int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2_L3);
> 
>   if (err)
>   return err;
>   }
> 
>   thread_group_shares_l2 = true;
> - 

[PATCH v5 1/2] cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards

2021-07-19 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
of the Extended CEDE states advertised by the platform.

On POWER9 LPARs, the firmware advertises a very low value of 2us for
the CEDE1 exit latency on a Dedicated LPAR. The latency advertised by
the PHYP hypervisor corresponds to the latency required to wake up
from the underlying hardware idle state. However the wakeup latency
from the LPAR perspective should include

1. The time taken to transition the CPU from the Hypervisor into the
   LPAR post wakeup from platform idle state

2. Time taken to send the IPI from the source CPU (waker) to the idle
   target CPU (wakee).

1. can be measured via a timer idle test, where we queue a timer, say
for 1ms, and enter the CEDE state. When the timer fires, in the timer
handler we compute how much extra time over the expected 1ms we have
consumed. On a POWER9 LPAR the numbers are

CEDE latency measured using a timer (numbers in ns)
N    Min   Median  Avg      90%ile  99%ile  Max   Stddev
400  2601  5677    5668.74  5917    6413    9299  455.01

1. and 2. combined can be determined by an IPI latency test where we
send an IPI to an idle CPU and in the handler compute the time
difference between when the IPI was sent and when the handler ran. We
see the following numbers on POWER9 LPAR.

CEDE latency measured using an IPI (numbers in ns)
N    Min  Median  Avg      90%ile  99%ile  Max   Stddev
400  711  7564    7369.43  8559    9514    9698  1200.01

If we consider the 99th percentile latency value measured using the
IPI to be the wakeup latency, the value would be 9.5us. This is in the
ballpark of the default value of 10us.
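
A minimal sketch of the IPI measurement described above (function names
are hypothetical; the real harness gathers the statistics shown in the
tables):

	static ktime_t ipi_sent_at;

	static void ipi_latency_handler(void *unused)
	{
		s64 delta_ns = ktime_to_ns(ktime_sub(ktime_get(), ipi_sent_at));

		pr_info("IPI wakeup latency: %lld ns\n", delta_ns);
	}

	static void measure_ipi_latency(int idle_cpu)
	{
		ipi_sent_at = ktime_get();
		/* wait == 1: return only after the handler has run on idle_cpu */
		smp_call_function_single(idle_cpu, ipi_latency_handler, NULL, 1);
	}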

Hence, use the exit latency of CEDE(0) based on the latency values
advertised by the platform only from POWER10 onwards. The values
advertised on POWER10 platforms are more realistic and informed by the
latency measurements. For earlier platforms stick to the default value
of 10us. The fix was suggested by Michael Ellerman.

Reported-by: Enrico Joedecke 
Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)")
Cc: Michal Suchanek 
Cc: Vaidyanathan Srinivasan 
Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-pseries.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index a2b5c6f..e592280d 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -419,7 +419,21 @@ static int pseries_idle_probe(void)
cpuidle_state_table = shared_states;
max_idle_state = ARRAY_SIZE(shared_states);
} else {
-   fixup_cede0_latency();
+   /*
+* Use firmware provided latency values
+* starting with POWER10 platforms. In the
+* case that we are running on a POWER10
+* platform but in an earlier compat mode, we
+* can still use the firmware provided values.
+*
+* However, on platforms prior to POWER10, we
+* cannot rely on the accuracy of the firmware
+* provided latency values. On such platforms,
+* go with the conservative default estimate
+* of 10us.
+*/
+   if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
+   fixup_cede0_latency();
cpuidle_state_table = dedicated_states;
max_idle_state = NR_DEDICATED_STATES;
}
-- 
1.9.4



[PATCH v5 2/2] cpuidle/pseries: Do not cap the CEDE0 latency in fixup_cede0_latency()

2021-07-19 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently the fixup_cede0_latency() code fixes up the CEDE(0) exit
latency value only if the minimum advertised extended CEDE latency
value is less than 10us. This was done so as not to break the expected
behaviour on POWER8 platforms, where the advertised latency was higher
than the default 10us, which would delay the SMT folding on the core.

However, after the earlier patch "cpuidle/pseries: Fixup CEDE0 latency
only for POWER10 onwards", we can be sure that the fixup of CEDE0
latency is going to happen only from POWER10 onwards. Hence
unconditionally use the minimum exit latency provided by the platform.
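
To make the arithmetic concrete (the advertised values here are
hypothetical): if the platform advertised extended CEDE exit latencies
of 5us, 12us and 25us, then after this patch

	exit_latency     = min(5, 12, 25) = 5us
	target_residency = 10 * 5         = 50us

instead of the exit latency being capped at the 10us default.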

Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-pseries.c | 59 ---
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index e592280d..bba449b 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -346,11 +346,9 @@ static int pseries_cpuidle_driver_init(void)
 static void __init fixup_cede0_latency(void)
 {
struct xcede_latency_payload *payload;
-   u64 min_latency_us;
+   u64 min_xcede_latency_us = UINT_MAX;
int i;
 
-   min_latency_us = dedicated_states[1].exit_latency; // CEDE latency
-
if (parse_cede_parameters())
return;
 
@@ -358,42 +356,45 @@ static void __init fixup_cede0_latency(void)
nr_xcede_records);
 
payload = &xcede_latency_parameter.payload;
+
+   /*
+* The CEDE idle state maps to CEDE(0). While the hypervisor
+* does not advertise CEDE(0) exit latency values, it does
+* advertise the latency values of the extended CEDE states.
+* We use the lowest advertised exit latency value as a proxy
+* for the exit latency of CEDE(0).
+*/
for (i = 0; i < nr_xcede_records; i++) {
struct xcede_latency_record *record = &payload->records[i];
+   u8 hint = record->hint;
u64 latency_tb = be64_to_cpu(record->latency_ticks);
u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
 
-   if (latency_us == 0)
-   pr_warn("cpuidle: xcede record %d has an unrealistic latency of 0us.\n", i);
-
-   if (latency_us < min_latency_us)
-   min_latency_us = latency_us;
-   }
-
-   /*
-* By default, we assume that CEDE(0) has exit latency 10us,
-* since there is no way for us to query from the platform.
-*
-* However, if the wakeup latency of an Extended CEDE state is
-* smaller than 10us, then we can be sure that CEDE(0)
-* requires no more than that.
-*
-* Perform the fix-up.
-*/
-   if (min_latency_us < dedicated_states[1].exit_latency) {
/*
-* We set a minimum of 1us wakeup latency for cede0 to
-* distinguish it from snooze
+* We expect the exit latency of an extended CEDE
+* state to be non-zero, since it takes at least
+* a few nanoseconds to wake up the idle CPU and
+* dispatch the virtual processor into the Linux
+* guest.
+*
+* So we consider only non-zero value for performing
+* the fixup of CEDE(0) latency.
 */
-   u64 cede0_latency = 1;
+   if (latency_us == 0) {
+   pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
+   i, hint);
+   continue;
+   }
 
-   if (min_latency_us > cede0_latency)
-   cede0_latency = min_latency_us - 1;
+   if (latency_us < min_xcede_latency_us)
+   min_xcede_latency_us = latency_us;
+   }
 
-   dedicated_states[1].exit_latency = cede0_latency;
-   dedicated_states[1].target_residency = 10 * (cede0_latency);
+   if (min_xcede_latency_us != UINT_MAX) {
+   dedicated_states[1].exit_latency = min_xcede_latency_us;
+   dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
-   cede0_latency);
+   min_xcede_latency_us);
}
 
 }
-- 
1.9.4



[PATCH v5 0/2] cpuidle/pseries: cleanup of the CEDE0 latency fixup code

2021-07-19 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 


Hi,

This is the v5 of the patchset to fixup CEDE0 latency only from
POWER10 onwards.


The previous versions of the patches are
v4 : 
https://lore.kernel.org/linux-pm/1623048014-16451-1-git-send-email-...@linux.vnet.ibm.com/
v3 : 
https://lore.kernel.org/linuxppc-dev/1619697982-28461-1-git-send-email-...@linux.vnet.ibm.com/
v2 : 
https://lore.kernel.org/linuxppc-dev/1619673517-10853-1-git-send-email-...@linux.vnet.ibm.com/
v1 : 
https://lore.kernel.org/linuxppc-dev/1619104049-5118-1-git-send-email-...@linux.vnet.ibm.com/

v4 --> v5 changes
 * Patch 1 : Unchanged. Rebased it against the latest powerpc/merge
  tree. With this patch, on processors older than POWER10, the CEDE
  latency is set to the hardcoded value of 10us which is closer to the
  measured value (details of the measurement in Patch 1).

 * Added a Patch 2/2 titled "cpuidle/pseries: Do not cap the CEDE0
   latency in fixup_cede0_latency()" which will ensure that on POWER10
   onwards we simply take the latency value exposed by the firmware
   without keeping an upper cap of 10us. This upper cap was previously
   required to prevent regression on POWER8 which had advertized
   latency values higher than 10us while the measured values were
   lesser. With Patch 1, we don't need the upper cap any longer.


Tested the series on POWER8, POWER9 and POWER10 with the
cpuidle-smt-performance test case
(https://github.com/gautshen/misc/tree/master/cpuidle-smt-performance).
  
This test has three classes of threads
1. Workload thread which computes fibonacci numbers (Pinned to the
   primary thread CPU 8). We are interested in the throughput of this
   workload thread.

2. Three irritator threads which are pinned to the secondary CPUs of
   the core on which the workload thread is running (CPUs 10, 12,
   14). These irritators block on a pipe until woken up by a
   waker. After being woken up, they again go back to sleep by
   blocking on a pipe read. We are interested in the wakeup latency of
   the irritator threads.

3. A waker thread which, pinned to a different core (CPU 16) from
   where the workload and the irritators are running, periodically
   wakes up the three irritator threads by writing to their respective
   pipes. The purpose of these periodic wakeups is to prime the
   cpuidle governor on the irritator CPUs to pick the idle state
   corresponding to the wakeup period.

We measure the wakeup latency of the irritator threads, which tells us
the impact of entering a particular combination of idle states. Thus,
the shallower the state, the lower the wakeup latency should be.

We also measure the throughput of the fibonacci workload to gauge the
single-thread performance in the presence of the waking irritators on
the sibling threads. Entering an idle state which performs SMT folding
should yield greater throughput.
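
The irritator side of the harness boils down to a blocking-read loop of
roughly the following shape (a sketch under stated assumptions:
record_latency_ns() is a hypothetical helper, and the real code in the
repository above differs in detail):

	#include <time.h>
	#include <unistd.h>

	/* Irritator thread: block on the pipe, then record the wakeup latency. */
	static void *irritator_fn(void *arg)
	{
		int pipe_rd = *(int *)arg;
		struct timespec sent, now;

		for (;;) {
			/*
			 * The waker writes its send timestamp; the read blocks,
			 * so the CPU enters snooze/CEDE while we wait.
			 */
			if (read(pipe_rd, &sent, sizeof(sent)) != sizeof(sent))
				break;
			clock_gettime(CLOCK_MONOTONIC, &now);
			record_latency_ns((now.tv_sec - sent.tv_sec) * 1000000000L +
					  (now.tv_nsec - sent.tv_nsec));
		}
		return NULL;
	}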

There is no observable difference in the behaviour on POWER8 and
POWER10 with and without the patch series, since the CEDE latencies on
both of them with and without the patch are 10us.

However, on POWER9, without the patch, the CEDE latency is 1us based
on the value returned by the firmware (which is not accurate), while
with the patch it is set to the default value of 10us, which is closer
to the measured value.

The wakeup latency, throughput, and the snooze/CEDE idle-state
percentage-residency results on POWER9 with and without the patch are
as follows.

We observe that for a wakeup period between 20us and 100us, the wakeup
latency of the irritator threads with the patch improves by 40-45%.

Though note that with the patch, the throughput of the fibonacci
workload drops by 5-10% when the wakeup period of the irritator
threads is between 20us and 100us. This is an acceptable tradeoff,
since there are certain benchmarks on POWER9 which are very sensitive
to the wakeup latency and have a sleeping duration of less than 100us.


Avg Wakeup Latency of the irritator threads
(lower the better)

Irritator  |           |
wakeup     |  Without  |  With
period     |  Patch    |  Patch
-----------+-----------+---------------------
    1 us   |  3.703 us |  3.632 us ( -1.91%)
    2 us   |  3.843 us |  3.925 us ( +2.13%)
    5 us   |  8.575 us |  8.656 us ( +0.94%)
   10 us   |  8.264 us |  8.242 us ( -0.27%)
   20 us   |  8.672 us |  8.256 us ( -4.80%)
   50 us   | 15.552 us |  8.257 us (-46.90%)
   80 us   | 15.603 us |  8.803 us (-43.58%)
  100 us   | 15.617 us |  8.328 us (-46.67%)
  120 us   | 15.612 us | 14.505 us ( -7.09%)
  500 us   | 15.957 us | 15.723 us ( -1.47%)
 1000 us   | 16.526 us | 16.502 us ( -0.14%)



~~
Fibonacci workload throughput in Million Operat

Re: [PATCH v2 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-07-08 Thread Gautham R Shenoy
Hello Pratik,

On Tue, Jul 06, 2021 at 01:54:00PM +0530, Pratik R. Sampat wrote:
> Adds a generic interface to represent the energy and frequency related
> PAPR attributes on the system using the new H_CALL
> "H_GET_ENERGY_SCALE_INFO".
> 
> The H_GET_EM_PARMS H_CALL was previously responsible for exporting this
> information via lparcfg; however, the H_GET_EM_PARMS H_CALL will be
> deprecated from P10 onwards.
> 
> The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
> hcall(
>   uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
>   uint64 flags,   // Per the flag request
>   uint64 firstAttributeId,// The attribute id
>   uint64 bufferAddress,   // Guest physical address of the output buffer
>   uint64 bufferSize   // The size in bytes of the output buffer
> );
> 
> This H_CALL can query either all the attributes at once, with
> firstAttributeId = 0 and flags = 0, or only one attribute at a time,
> with firstAttributeId = id and flags = 1.
> 
> The output buffer consists of the following
> 1. number of attributes  - 8 bytes
> 2. array offset to the data location - 8 bytes
> 3. version info  - 1 byte
> 4. A data array of size num attributes, which contains the following:
>   a. attribute ID  - 8 bytes
>   b. attribute value in number - 8 bytes
>   c. attribute name in string  - 64 bytes
>   d. attribute value in string - 64 bytes
> 
> The new H_CALL exports information in direct string value format, hence
> a new interface has been introduced in
> /sys/firmware/papr/energy_scale_info to export this information to
> userspace in an extensible pass-through format.
> 
> The H_CALL returns the name, numeric value and string value (if exists)
> 
> The format of exposing the sysfs information is as follows:
> /sys/firmware/papr/energy_scale_info/
>|-- /
>  |-- desc
>  |-- value
>  |-- value_desc (if exists)
>|-- /
>  |-- desc
>  |-- value
>  |-- value_desc (if exists)
> ...


I like this implementation. Only one comment w.r.t versioning below:

[..snip..]

> @@ -631,6 +632,26 @@ struct hv_gpci_request_buffer {
>   uint8_t bytes[HGPCI_MAX_DATA_BYTES];
>  } __packed;
> 
> +#define MAX_ESI_ATTRS 10
> +#define MAX_BUF_SZ   (sizeof(struct h_energy_scale_info_hdr) + \
> + (sizeof(struct energy_scale_attribute) * MAX_ESI_ATTRS))
> +
> +struct energy_scale_attribute {
> + __be64 id;
> + __be64 value;
> + unsigned char desc[64];
> + unsigned char value_desc[64];
> +} __packed;
> +
> +struct h_energy_scale_info_hdr {
> + __be64 num_attrs;
> + __be64 array_offset;
> + __u8 data_header_version;
> +} __packed;
> +


[..snip..]

> +static int __init papr_init(void)
> +{
> + uint64_t num_attrs;
> + int ret, idx, i;
> + char *esi_buf;
> +
> + if (!firmware_has_feature(FW_FEATURE_LPAR))
> + return -ENXIO;
> +
> + esi_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL);
> + if (esi_buf == NULL)
> + return -ENOMEM;
> + /*
> +  * hcall(
> +  * uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
> +  * uint64 flags,// Per the flag request
> +  * uint64 firstAttributeId, // The attribute id
> +  * uint64 bufferAddress,// Guest physical address of the output buffer
> +  * uint64 bufferSize);  // The size in bytes of the output buffer
> +  */
> + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_ALL, 0,
> +  virt_to_phys(esi_buf), MAX_BUF_SZ);


It is good that you are passing a character buffer here and
interpreting it later. This helps in the cases where the header has
more fields than what we are aware of. I think even if a future
platform adds newer fields to the header, those fields will be
appended after the existing fields, so we should still be able to
interpret the first three fields of the header that we are currently
aware of.


> + if (ret != H_SUCCESS) {
> + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
> + goto out;
> + }
> +
> + esi_hdr = (struct h_energy_scale_info_hdr *) esi_buf;
> + num_attrs = be64_to_cpu(esi_hdr->num_attrs);


Shouldn't we check for the esi_hdr->data_header_version here?
Currently we are only aware of the version 1. If we happen to run this
kernel code on a future platform which supports a different version,
wouldn't it be safer to bail out here ?
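
Something along these lines, perhaps (the constant name is
hypothetical):

	/* Version 1 is the only layout this code knows how to parse. */
	#define ESI_HDR_VERSION_KNOWN	1

	if (esi_hdr->data_header_version != ESI_HDR_VERSION_KNOWN) {
		pr_warn("H_GET_ENERGY_SCALE_INFO: unsupported header version %d\n",
			esi_hdr->data_header_version);
		ret = -ENODEV;
		goto out;
	}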

Otherwise this patch looks good to me.

Reviewed-by: Gautham R. Shenoy 

--
Thanks and Regards
gautham.


Re: [PATCH] cpufreq:powernv: Fix init_chip_info initialization in numa=off

2021-07-08 Thread Gautham R Shenoy
Hello Pratik,

On Tue, Jun 15, 2021 at 10:39:49AM +0530, Pratik R. Sampat wrote:
> In the numa=off kernel command-line configuration init_chip_info() loops
> around the number of chips and attempts to copy the cpumask of that node
> which is NULL for all iterations after the first chip.

Thanks for taking a look into this. Indeed there is an issue here,
because the code assumes that the node mask can serve as a proxy for
the chip mask. This assumption breaks when running with numa=off,
since there will only be a single node, but multiple chips.


> 
> Hence adding a check to bail out after the first initialization if there
> is only one node.
> 
> Fixes: 053819e0bf84 ("cpufreq: powernv: Handle throttling due to Pmax capping at chip level")
> Signed-off-by: Pratik R. Sampat 
> Reported-by: Shirisha Ganta 
> ---
>  drivers/cpufreq/powernv-cpufreq.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> index e439b43c19eb..663f9c4b5e3a 100644
> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -1078,6 +1078,8 @@ static int init_chip_info(void)
>   INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
>   for_each_cpu(cpu, &chips[i].mask)
>   per_cpu(chip_info, cpu) = &chips[i];
> + if (num_possible_nodes() == 1)
> + break;

With this we will only initialize the chip[0].throttle work function,
while for the rest of the chips chip[i].throttle will be
uninitialized. While we may be running in the numa=off mode, the fact
remains that those other chips do exist, and they may be experiencing
throttling, during which they will try to schedule work for chip[i] in
order to take corrective action, which will fail.

Hence a more correct approach may be to maintain a chip[i] mask
independent of the node mask.
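
A sketch of that direction, which is what the subsequent revision of
this patch implements (see the chip_cpu_mask version reviewed earlier
in this digest):

	/* Build one cpumask per chip, independent of the NUMA node masks. */
	for_each_possible_cpu(cpu) {
		unsigned int id = cpu_to_chip_id(cpu);

		if (prev_chip_id != id) {
			prev_chip_id = id;
			chip[nr_chips++] = id;
		}
		cpumask_set_cpu(cpu, &chip_cpu_mask[nr_chips - 1]);
	}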





>   }
>  
>  free_and_return:
> -- 
> 2.30.2
> 


Re: [PATCH 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-06-18 Thread Gautham R Shenoy
On Wed, Jun 16, 2021 at 07:12:40PM +0530, Pratik R. Sampat wrote:
> Adds a generic interface to represent the energy and frequency related
> PAPR attributes on the system using the new H_CALL
> "H_GET_ENERGY_SCALE_INFO".
> 
> The H_GET_EM_PARMS H_CALL was previously responsible for exporting this
> information via lparcfg; however, the H_GET_EM_PARMS H_CALL will be
> deprecated from P10 onwards.
> 
> The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
> hcall(
>   uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
>   uint64 flags,   // Per the flag request
>   uint64 firstAttributeId,// The attribute id
>   uint64 bufferAddress,   // Guest physical address of the output buffer
>   uint64 bufferSize   // The size in bytes of the output buffer
> );
> 
> This H_CALL can query either all the attributes at once with
> firstAttributeId = 0, flags = 0 as well as query only one attribute
> at a time with firstAttributeId = id


For a single attribute, the “firstAttributeId” must be set by the
caller to the attribute id to retrieve, and the “singleAttribute”
field in “flags” must also be set to 1.

If we don't set the "flags" to 1, while specifying a firstAttributeId,
the hypervisor will populate the buffer with the details pertaining to
all the attributes beginning with firstAttributeId.



> 
> The output buffer consists of the following
> 1. number of attributes  - 8 bytes
> 2. array offset to the data location - 8 bytes
> 3. version info  - 1 byte
> 4. A data array of size num attributes, which contains the following:
>   a. attribute ID  - 8 bytes
>   b. attribute value in number - 8 bytes
>   c. attribute name in string  - 64 bytes
>   d. attribute value in string - 64 bytes
> 
[..snip..]

> +
> +static ssize_t papr_show_value(struct kobject *kobj,
> + struct kobj_attribute *attr,
> + char *buf)
> +{
> + struct papr_attr *pattr = container_of(attr, struct papr_attr, attr);
> + struct hv_energy_scale_buffer *t_buf;
> + struct energy_scale_attributes *t_ea;
> + int data_offset, ret = 0;
> +
> + t_buf = kmalloc(sizeof(*t_buf), GFP_KERNEL);
> + if (t_buf == NULL)
> + return -ENOMEM;
> +
> + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, 0,
> +  pattr->id, virt_to_phys(t_buf),
> +  sizeof(*t_buf));
> +


In this case, since we are interested in only one attribute, we can
make the call

ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, 1,
 pattr->id, virt_to_phys(t_buf),
 sizeof(*t_buf));

setting flags=1.

Same in the function papr_show_value_desc() below

--
Thanks and Regards
gautham.


> + if (ret != H_SUCCESS) {
> + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
> + goto out;
> + }
> +
> + data_offset = be64_to_cpu(t_buf->array_offset) -
> + (sizeof(t_buf->num_attr) +
> + sizeof(t_buf->array_offset) +
> + sizeof(t_buf->data_header_version));
> +
> + t_ea = (struct energy_scale_attributes *) &t_buf->data[data_offset];
> +
> + ret = sprintf(buf, "%llu\n", be64_to_cpu(t_ea->attr_value));
> + if (ret < 0)
> + ret = -EIO;
> +out:
> + kfree(t_buf);
> +
> + return ret;
> +}
> +
> +static ssize_t papr_show_value_desc(struct kobject *kobj,
> +  struct kobj_attribute *attr,
> +  char *buf)
> +{
> + struct papr_attr *pattr = container_of(attr, struct papr_attr, attr);
> + struct hv_energy_scale_buffer *t_buf;
> + struct energy_scale_attributes *t_ea;
> + int data_offset, ret = 0;
> +
> + t_buf = kmalloc(sizeof(*t_buf), GFP_KERNEL);
> + if (t_buf == NULL)
> + return -ENOMEM;
> +
> + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, 0,
> +  pattr->id, virt_to_phys(t_buf),
> +  sizeof(*t_buf));
> +
> + if (ret != H_SUCCESS) {
> + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
> + goto out;
> + }
> +
> + data_offset = be64_to_cpu(t_buf->array_offset) -
> + (sizeof(t_buf->num_attr) +
> + sizeof(t_buf->array_offset) +
> + sizeof(t_buf->data_header_version));
> +
> + t_ea = (struct energy_scale_attributes *) &t_buf->data[data_offset];
> +
> + ret = sprintf(buf, "%s\n", t_ea->attr_value_desc);
> + if (ret < 0)
> + ret = -EIO;
> +out:
> + kfree(t_buf);
> +
> + return ret;
> +}
> +
> +static struct papr_ops_info {
> + const char *attr_name;
> + ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
> + char *buf);
> +} ops_info[] = {
> + { "desc", papr_show_desc },
> + { 

[PATCH v3] cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards

2021-04-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
of the Extended CEDE states advertised by the platform.

On POWER9 LPARs, the firmware advertises a very low value of 2us for
the CEDE1 exit latency on a Dedicated LPAR. The latency advertised by
the PHYP hypervisor corresponds to the latency required to wake up
from the underlying hardware idle state. However the wakeup latency
from the LPAR perspective should include

1. The time taken to transition the CPU from the Hypervisor into the
   LPAR post wakeup from platform idle state

2. Time taken to send the IPI from the source CPU (waker) to the idle
   target CPU (wakee).

1. can be measured via a timer idle test, where we queue a timer, say
for 1ms, and enter the CEDE state. When the timer fires, in the timer
handler we compute how much extra time over the expected 1ms we have
consumed. On a POWER9 LPAR the numbers are

CEDE latency measured using a timer (numbers in ns)
N    Min   Median  Avg      90%ile  99%ile  Max   Stddev
400  2601  5677    5668.74  5917    6413    9299  455.01

1. and 2. combined can be determined by an IPI latency test where we
send an IPI to an idle CPU and in the handler compute the time
difference between when the IPI was sent and when the handler ran. We
see the following numbers on POWER9 LPAR.

CEDE latency measured using an IPI (numbers in ns)
N    Min  Median  Avg      90%ile  99%ile  Max   Stddev
400  711  7564    7369.43  8559    9514    9698  1200.01

If we consider the 99th percentile latency value measured using the
IPI to be the wakeup latency, the value would be 9.5us. This is in the
ballpark of the default value of 10us.

Hence, use the exit latency of CEDE(0) based on the latency values
advertised by the platform only from POWER10 onwards. The values
advertised on POWER10 platforms are more realistic and informed by the
latency measurements. For earlier platforms stick to the default value
of 10us. The fix was suggested by Michael Ellerman.

Reported-by: Enrico Joedecke 
Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)")
Cc: Michal Suchanek 
Cc: Vaidyanathan Srinivasan 
Signed-off-by: Gautham R. Shenoy 
---
v2-->v3: Modify the condition to preclude only the platforms prior to
 POWER10 from using the firmware provided values.
 
 drivers/cpuidle/cpuidle-pseries.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index a2b5c6f..694d71e 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -349,6 +349,15 @@ static void __init fixup_cede0_latency(void)
u64 min_latency_us;
int i;
 
+   /*
+* Use firmware provided latency values on POWER10 onwards and
+* also on POWER10 running in POWER9-compat mode. On platforms
+* prior to POWER10, we cannot rely on the firmware provided
+* values, so we go with the conservative default value.
+*/
+   if (!cpu_has_feature(CPU_FTR_ARCH_31) && !pvr_version_is(PVR_POWER10))
+   return;
+
min_latency_us = dedicated_states[1].exit_latency; // CEDE latency
 
if (parse_cede_parameters())
-- 
1.9.4



Re: [PATCH v2] cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards

2021-04-29 Thread Gautham R Shenoy
Hello Michael,

On Thu, Apr 29, 2021 at 07:56:25PM +1000, Michael Ellerman wrote:
> "Gautham R. Shenoy"  writes:
> > From: "Gautham R. Shenoy" 
> >
> > Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
> > CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
> > of the Extended CEDE states advertised by the platform
> >
> > On POWER9 LPARs, the firmware advertises a very low value of 2us for
> > the CEDE1 exit latency on a Dedicated LPAR. The latency advertised by
> > the PHYP hypervisor corresponds to the latency required to wake up
> > from the underlying hardware idle state. However the wakeup latency
> > from the LPAR perspective should include
> >
> > 1. The time taken to transition the CPU from the Hypervisor into the
> >LPAR post wakeup from platform idle state
> >
> > 2. Time taken to send the IPI from the source CPU (waker) to the idle
> >target CPU (wakee).
> >
> > 1. can be measured via a timer idle test, where we queue a timer, say
> > for 1ms, and enter the CEDE state. When the timer fires, in the timer
> > handler we compute how much extra time over the expected 1ms we have
> > consumed. On a POWER9 LPAR the numbers are
> >
> > CEDE latency measured using a timer (numbers in ns)
> > N    Min   Median  Avg      90%ile  99%ile  Max   Stddev
> > 400  2601  5677    5668.74  5917    6413    9299  455.01
> >
> > 1. and 2. combined can be determined by an IPI latency test where we
> > send an IPI to an idle CPU and in the handler compute the time
> > difference between when the IPI was sent and when the handler ran. We
> > see the following numbers on POWER9 LPAR.
> >
> > CEDE latency measured using an IPI (numbers in ns)
> > N    Min  Median  Avg      90%ile  99%ile  Max   Stddev
> > 400  711  7564    7369.43  8559    9514    9698  1200.01
> >
> > If we consider the 99th percentile latency value measured using the
> > IPI to be the wakeup latency, the value would be 9.5us. This is in
> > the ballpark of the default value of 10us.
> >
> > Hence, use the exit latency of CEDE(0) based on the latency values
> > advertized by platform only from POWER10 onwards. The values
>^^^
> > advertized on POWER10 platforms is more realistic and informed by the
> > latency measurements. For earlier platforms stick to the default value
> > of 10us.
> 
> ...
> 
> > diff --git a/drivers/cpuidle/cpuidle-pseries.c 
> > b/drivers/cpuidle/cpuidle-pseries.c
> > index a2b5c6f..7207467 100644
> > --- a/drivers/cpuidle/cpuidle-pseries.c
> > +++ b/drivers/cpuidle/cpuidle-pseries.c
> > @@ -419,7 +419,8 @@ static int pseries_idle_probe(void)
> > cpuidle_state_table = shared_states;
> > max_idle_state = ARRAY_SIZE(shared_states);
> > } else {
> > -   fixup_cede0_latency();
> > +   if (pvr_version_is(PVR_POWER10))
> > +   fixup_cede0_latency();
> 
> A PVR check like that tests for *only* Power10, not Power10 and onwards
> as you say in the change log.

Right. The accurate thing would be to perform the fixup only for

!(PVR_POWER4 || PVR_POWER4p || PVR_POWER5 || PVR_POWER5p || PVR_POWER6 ||
  PVR_POWER7 || PVR_POWER8 || PVR_POWER9)

But that was a bit of a mouthful. I will go with your suggestion (from
private correspondence)

if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
fixup_cede0_latency(); 

since it will allow the fixup for processors supporting ISA 3.1
(POWER10 and above) and also on POWER10 CPUs running in compat mode.


> 
> The other question is what should happen on a Power10 LPAR that's
> running in Power9 compat mode. I assume in that case we *do* want to use
> the firmware provided values, because they're tied to the underlying
> CPU, not the compat mode?
>

Yes, the firmware provided values are tied to the underlying CPU. Not
the compat mode.


> In which case a check for !PVR_POWER9 would seem to achieve what we
> want?
> 
> cheers

--
Thanks and Regards
gautham.


[PATCH v2] cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards

2021-04-28 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
of the Extended CEDE states advertised by the platform.

On POWER9 LPARs, the firmware advertises a very low value of 2us for
the CEDE1 exit latency on a Dedicated LPAR. The latency advertised by
the PHYP hypervisor corresponds to the latency required to wake up
from the underlying hardware idle state. However the wakeup latency
from the LPAR perspective should include

1. The time taken to transition the CPU from the Hypervisor into the
   LPAR post wakeup from platform idle state

2. Time taken to send the IPI from the source CPU (waker) to the idle
   target CPU (wakee).

1. can be measured via a timer idle test, where we queue a timer, say
for 1ms, and enter the CEDE state. When the timer fires, in the timer
handler we compute how much extra time over the expected 1ms we have
consumed. On a POWER9 LPAR the numbers are

CEDE latency measured using a timer (numbers in ns)
N    Min   Median  Avg      90%ile  99%ile  Max   Stddev
400  2601  5677    5668.74  5917    6413    9299  455.01

1. and 2. combined can be determined by an IPI latency test where we
send an IPI to an idle CPU and in the handler compute the time
difference between when the IPI was sent and when the handler ran. We
see the following numbers on POWER9 LPAR.

CEDE latency measured using an IPI (numbers in ns)
N    Min  Median  Avg      90%ile  99%ile  Max   Stddev
400  711  7564    7369.43  8559    9514    9698  1200.01

If we consider the 99th percentile latency value measured using the
IPI to be the wakeup latency, the value would be 9.5us. This is in the
ballpark of the default value of 10us.

Hence, use the exit latency of CEDE(0) based on the latency values
advertised by the platform only from POWER10 onwards. The values
advertised on POWER10 platforms are more realistic and informed by the
latency measurements. For earlier platforms stick to the default value
of 10us.

Reported-by: Enrico Joedecke 
Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)")
Cc: Michal Suchanek 
Cc: Vaidyanathan Srinivasan 
Signed-off-by: Gautham R. Shenoy 
---
v1-->v2 : Updated the commit log with the details of the measured
  latencies on POWER9 LPARs.
 drivers/cpuidle/cpuidle-pseries.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index a2b5c6f..7207467 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -419,7 +419,8 @@ static int pseries_idle_probe(void)
cpuidle_state_table = shared_states;
max_idle_state = ARRAY_SIZE(shared_states);
} else {
-   fixup_cede0_latency();
+   if (pvr_version_is(PVR_POWER10))
+   fixup_cede0_latency();
cpuidle_state_table = dedicated_states;
max_idle_state = NR_DEDICATED_STATES;
}
-- 
1.8.3.1



Re: [PATCH] cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards

2021-04-28 Thread Gautham R Shenoy
Hello Michal,

On Wed, Apr 28, 2021 at 10:03:26AM +0200, Michal Suchánek wrote:

> > 
> That's a nice detailed explanation. Maybe you could summarize it in the
> commit message so that people looking at the patch in the future can
> tell where the value comes from.

Sure, I will do that and send a v2 with the updated commit message.


> 
> Thanks
> 
> Michal


Re: [PATCH] cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards

2021-04-27 Thread Gautham R Shenoy
Hello Michal,

On Sun, Apr 25, 2021 at 01:07:14PM +0200, Michal Suchánek wrote:
> On Sat, Apr 24, 2021 at 01:07:16PM +0530, Vaidyanathan Srinivasan wrote:
> > * Michal Suchánek  [2021-04-23 20:42:16]:
> > 
> > > On Fri, Apr 23, 2021 at 11:59:30PM +0530, Vaidyanathan Srinivasan wrote:
> > > > * Michal Suchánek  [2021-04-23 19:45:05]:
> > > > 
> > > > > On Fri, Apr 23, 2021 at 09:29:39PM +0530, Vaidyanathan Srinivasan 
> > > > > wrote:
> > > > > > * Michal Suchánek  [2021-04-23 09:35:51]:
> > > > > > 
> > > > > > > On Thu, Apr 22, 2021 at 08:37:29PM +0530, Gautham R. Shenoy wrote:
> > > > > > > > From: "Gautham R. Shenoy" 
> > > > > > > > 
> > > > > > > > Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
> > > > > > > > CEDE(0)") sets the exit latency of CEDE(0) based on the latency 
> > > > > > > > values
> > > > > > > > of the Extended CEDE states advertised by the platform
> > > > > > > > 
> > > > > > > > On some of the POWER9 LPARs, the older firmwares advertise a 
> > > > > > > > very low
> > > > > > > > value of 2us for CEDE1 exit latency on a Dedicated LPAR. 
> > > > > > > > However the
> > > > > > > Can you be more specific about 'older firmwares'?
> > > > > > 
> > > > > > Hi Michal,
> > > > > > 
> > > > > > This is POWER9 vs POWER10 difference, not really an obsolete FW.  
> > > > > > The
> > > > > > key idea behind the original patch was to make the H_CEDE latency 
> > > > > > and
> > > > > > hence target residency come from firmware instead of being decided 
> > > > > > by
> > > > > > the kernel.  The advantage is such that, different type of systems 
> > > > > > in
> > > > > > POWER10 generation can adjust this value and have an optimal H_CEDE
> > > > > > entry criteria which balances good single thread performance and
> > > > > > wakeup latency.  Further we can have additional H_CEDE state to feed
> > > > > > into the cpuidle.  
> > > > > 
> > > > > So all POWER9 machines are affected by the firmware bug where firmware
> > > > > reports CEDE1 exit latency of 2us and the real latency is 5us which
> > > > > causes the kernel to prefer CEDE1 too much when relying on the values
> > > > > supplied by the firmware. It is not about 'older firmware'.
> > > > 
> > > > Correct.  All POWER9 systems running Linux as guest LPARs will see
> > > > extra usage of CEDE idle state, but not baremetal (PowerNV).
> > > > 
> > > > The correct definition of the bug or mismatch in expectations is that
> > > > firmware reports wakeup latency from a core/thread wakeup timing, but
> > > > not end-to-end time from sending a wakeup event like an IPI using
> > > > H_calls and receiving the events on the target.  Practically there are
> > > > a few extra microseconds needed after deciding to wake up a target
> > > > core/thread to getting the target to start executing instructions
> > > > within the LPAR instance.
> > > 
> > > Thanks for the detailed explanation.
> > > 
> > > Maybe just adding a few microseconds to the reported time would be a
> > > more reasonable workaround than using a blanket fixed value then.
> > 
> > Yes, that is an option.  But that may only reduce the difference
> > between existing kernel and new kernel unless we make it the same
> > number.  Further we are fixing this in P10 and hence we will have to
> > add "if(P9) do the compensation" and otherwise take it as is.  That
> > would not be elegant.  Given that our goal for P9 platform is to not
> > introduce changes in H_CEDE entry behaviour, we arrived at this
> > approach (this small patch) and this also makes it easy to backport to
> > various distro products.
> 
> I don't see how this is more elegant.
> 
> The current patch is
> 
> if(p9)
>   use fixed value
> 
> the suggested patch is
> 
> if(p9)
>   apply compensation


We could do that. However, from the recent measurements, the default
value is closer to the latency value measured using an IPI.

As Vaidy described earlier, on POWER9

[PATCH] cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards

2021-04-22 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
of the Extended CEDE states advertised by the platform.

On some of the POWER9 LPARs, the older firmwares advertise a very low
value of 2us for CEDE1 exit latency on a Dedicated LPAR. However the
measured value is 5us on an average. Due to the low advertised exit
latency, we are entering CEDE(0) more aggressively on such
platforms. While this helps achieve SMT folding faster, we pay the
penalty of having to send an IPI to wakeup the CPU when the target
residency is very short. This is showing up as a performance
regression on the newer kernels running on the LPARs with older
firmware.

Hence, set the exit latency of CEDE(0) based on the latency values
advertised by the platform only from POWER10 onwards. The values
advertised on POWER10 platforms are more realistic and informed by the
latency measurements.

For platforms older than POWER10, retain the hardcoded value of exit
latency, which is 10us. Though this is higher than the measured
values, we would be erring on the side of caution.

Reported-by: Enrico Joedecke 
Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)")
Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-pseries.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c 
b/drivers/cpuidle/cpuidle-pseries.c
index a2b5c6f..7207467 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -419,7 +419,8 @@ static int pseries_idle_probe(void)
cpuidle_state_table = shared_states;
max_idle_state = ARRAY_SIZE(shared_states);
} else {
-   fixup_cede0_latency();
+   if (pvr_version_is(PVR_POWER10))
+   fixup_cede0_latency();
cpuidle_state_table = dedicated_states;
max_idle_state = NR_DEDICATED_STATES;
}
-- 
1.9.4



Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-19 Thread Gautham R Shenoy
Hello Mel,

On Mon, Apr 12, 2021 at 11:48:19AM +0100, Mel Gorman wrote:
> On Mon, Apr 12, 2021 at 11:06:19AM +0100, Valentin Schneider wrote:
> > On 12/04/21 10:37, Mel Gorman wrote:
> > > On Mon, Apr 12, 2021 at 11:54:36AM +0530, Srikar Dronamraju wrote:
> > >> * Gautham R. Shenoy  [2021-04-02 11:07:54]:
> > >>
> > >> >
> > >> > To remedy this, this patch proposes that the LLC be moved to the MC
> > >> > level which is a group of cores in one half of the chip.
> > >> >
> > >> >   SMT (SMT4) --> MC (Hemisphere)[LLC] --> DIE
> > >> >
> > >>
> > >> I think marking Hemisphere as a LLC in a P10 scenario is a good idea.
> > >>
> > >> > While there is no cache being shared at this level, this is still the
> > >> > level where some amount of cache-snooping takes place and it is
> > >> > relatively faster to access the data from the caches of the cores
> > >> > within this domain. With this change, we no longer see regressions on
> > >> > P10 for applications which require single threaded performance.
> > >>
> > >> Peter, Valentin, Vincent, Mel, etal
> > >>
> > >> On architectures where we have multiple levels of cache access latencies
> > >> within a DIE, (For example: one within the current LLC or SMT core and 
> > >> the
> > >> other at MC or Hemisphere, and finally across hemispheres), do you have 
> > >> any
> > >> suggestions on how we could handle the same in the core scheduler?
> > >>
> > >
> > > Minimally I think it would be worth detecting when there are multiple
> > > LLCs per node and detecting that in generic code as a static branch. In
> > > select_idle_cpu, consider taking two passes -- first on the LLC domain
> > > and if no idle CPU is found then taking a second pass if the search depth
> > > allows within the node with the LLC CPUs masked out.
> > 
> > I think that's actually a decent approach. Tying SD_SHARE_PKG_RESOURCES to
> > something other than pure cache topology in a generic manner is tough (as
> > it relies on murky, ill-defined hardware fabric properties).
> > 
> 
> Agreed. The LLC->node scan idea has been on my TODO list to try for
> a while.

If you have any patches for these, I will be happy to test them on
POWER10. Though, on POWER10, there will be an additional sd between
the LLC and the DIE domain. 
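
For reference, this is roughly how I read the proposed two-pass scan;
every helper below is a hypothetical stand-in, not actual fair.c code:

#include <linux/cpumask.h>

extern const struct cpumask *llc_mask(int cpu);            /* hypothetical */
extern const struct cpumask *node_mask(int cpu);           /* hypothetical */
extern int scan_for_idle_cpu(const struct cpumask *mask);  /* hypothetical */
extern bool second_pass_allowed(void);                     /* hypothetical */

static int two_pass_idle_scan(int target, struct cpumask *scratch)
{
        int cpu;

        /* Pass 1: the target's LLC domain. */
        cpu = scan_for_idle_cpu(llc_mask(target));
        if (cpu >= 0)
                return cpu;

        if (!second_pass_allowed())
                return -1;

        /* Pass 2: the rest of the node, with the LLC CPUs masked out. */
        cpumask_andnot(scratch, node_mask(target), llc_mask(target));
        return scan_for_idle_cpu(scratch);
}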




> 
> > Last I tried thinking about that, I stopped at having a core-to-core
> > latency matrix, building domains off of that, and having some knob
> > specifying the highest distance value below which we'd set
> > SD_SHARE_PKG_RESOURCES. There's a few things I 'hate' about that; for one
> > it makes cpus_share_cache() somewhat questionable.
> > 
> 
> And I thought about something like this too but worried it might get
> complex, particularly on chiplets where we do not necessarily have
> hardware info on latency depending on how it's wired up. It also might
> lead to excessive cpumask manipulation in a fast path if we have to
> traverse multiple distances with search cost exceeding gains from latency
> reduction. Hence -- keeping it simple with two level only, LLC then node
> within the allowed search depth and see what that gets us. It might be
> "good enough" in most cases and would be a basis for comparison against
> complex approaches.


> 
> At minimum, I expect IBM can evaluate the POWER10 aspect and I can run
> an evaluation on Zen generations.


> 
> -- 
> Mel Gorman
> SUSE Labs


Re: [PATCH 3/3] powerpc/smp: Cache CPU to chip lookup

2021-04-16 Thread Gautham R Shenoy
On Thu, Apr 15, 2021 at 11:21:10PM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy  [2021-04-15 22:49:21]:
> 
> > > 
> > > +int *chip_id_lookup_table;
> > > +
> > >  #ifdef CONFIG_PPC64
> > >  int __initdata iommu_is_off;
> > >  int __initdata iommu_force_on;
> > > @@ -914,13 +916,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id);
> > >  int cpu_to_chip_id(int cpu)
> > >  {
> > >   struct device_node *np;
> > > + int ret = -1, idx;
> > > +
> > > + idx = cpu / threads_per_core;
> > > + if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1)
> > 
> 
> > The value -1 is ambiguous since we won't be able to determine if
> > it is because we haven't yet made an of_get_ibm_chip_id() call
> > or if an of_get_ibm_chip_id() call was made and it returned -1.
> > 
> 
> We don't allocate chip_id_lookup_table unless cpu_to_chip_id() returns
> a non -1 value for the boot-cpuid. So this ensures that we don't
> unnecessarily allocate chip_id_lookup_table. Also I check for
> chip_id_lookup_table before calling cpu_to_chip_id() for other CPUs.
> So this avoids the overhead of calling cpu_to_chip_id() for platforms
> that don't support it.  Also it's most likely that if the
> chip_id_lookup_table is initialized then the of_get_ibm_chip_id() call
> would return a valid value.
> 
> + Below we are only populating the lookup table, only when the
> of_get_cpu_node is valid.
> 
> So I don't see any drawbacks of initializing it to -1. Do you see
> any?


Only if other callers of cpu_to_chip_id() don't check whether
chip_id_lookup_table has been allocated or not. From a code
readability point of view, it is easier to have that check inside
cpu_to_chip_id() instead of requiring all its callers to make that
check.
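
To illustrate the alternative with a distinct sentinel (CHIP_ID_UNSET
and slow_lookup_chip_id() below are hypothetical, sketched only to
show the idea):

#include <linux/limits.h>

#define CHIP_ID_UNSET   S32_MIN /* distinct from the valid "no chip-id" -1 */

extern int threads_per_core;
extern int *chip_id_lookup_table;
extern int slow_lookup_chip_id(int cpu);        /* hypothetical DT walk */

int cached_cpu_to_chip_id(int cpu)
{
        int idx = cpu / threads_per_core;

        /* A cached -1 now means "looked up, no chip-id present", while
         * CHIP_ID_UNSET means "never looked up". */
        if (chip_id_lookup_table && chip_id_lookup_table[idx] != CHIP_ID_UNSET)
                return chip_id_lookup_table[idx];

        return slow_lookup_chip_id(cpu);
}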

> 
> > Thus, perhaps we can initialize chip_id_lookup_table[idx] with a
> > different unique negative value. How about S32_MIN ? and check
> > chip_id_lookup_table[idx] is different here ?
> > 
> 
> I had initially initialized to -2, But then I thought we adding in
> more confusion than necessary and it was not solving any issues.
> 
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH 3/3] powerpc/smp: Cache CPU to chip lookup

2021-04-15 Thread Gautham R Shenoy
On Thu, Apr 15, 2021 at 05:39:34PM +0530, Srikar Dronamraju wrote:
> On systems with a large number of CPUs per node, even with the
> filtered matching of related CPUs, there can be a large number of
> calls to cpu_to_chip_id() for the same CPU. For example, with a 4096
> vCPU, 1 node QEMU configuration, with 4 threads per core, the system
> could see up to 1024 calls to cpu_to_chip_id() for the same CPU. On a
> given system, cpu_to_chip_id() for a given CPU would always return the
> same value. Hence cache the result in a lookup table for use in
> subsequent calls.
> 
> Since all CPUs sharing the same core will belong to the same chip, the
> lookup_table has an entry for one CPU per core.  chip_id_lookup_table is
> not being freed and would be used on subsequent CPU online post CPU
> offline.
> 
> Suggested-by: Michael Ellerman 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: qemu-...@nongnu.org
> Cc: Cedric Le Goater 
> Cc: David Gibson 
> Cc: Nathan Lynch 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Gautham R Shenoy 
> Reported-by: Daniel Henrique Barboza 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/include/asm/smp.h |  1 +
>  arch/powerpc/kernel/prom.c | 19 +++
>  arch/powerpc/kernel/smp.c  | 21 +++--
>  3 files changed, 35 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 47081a9e13ca..03b3d010cbab 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -31,6 +31,7 @@ extern u32 *cpu_to_phys_id;
>  extern bool coregroup_enabled;
> 
>  extern int cpu_to_chip_id(int cpu);
> +extern int *chip_id_lookup_table;
> 
>  #ifdef CONFIG_SMP
> 
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index 9a4797d1d40d..6d2e4a5bc471 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -65,6 +65,8 @@
>  #define DBG(fmt...)
>  #endif
> 
> +int *chip_id_lookup_table;
> +
>  #ifdef CONFIG_PPC64
>  int __initdata iommu_is_off;
>  int __initdata iommu_force_on;
> @@ -914,13 +916,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id);
>  int cpu_to_chip_id(int cpu)
>  {
>   struct device_node *np;
> + int ret = -1, idx;
> +
> + idx = cpu / threads_per_core;
> + if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1)

The value -1 is ambiguous since we won't be able to determine if
it is because we haven't yet made an of_get_ibm_chip_id() call
or if an of_get_ibm_chip_id() call was made and it returned -1.

Thus, perhaps we can initialize chip_id_lookup_table[idx] with a
different unique negative value. How about S32_MIN ? and check
chip_id_lookup_table[idx] is different here ?


> + return chip_id_lookup_table[idx];
> 
>   np = of_get_cpu_node(cpu, NULL);
> - if (!np)
> - return -1;
> + if (np) {
> + ret = of_get_ibm_chip_id(np);
> + of_node_put(np);
> +
> + if (chip_id_lookup_table)
> + chip_id_lookup_table[idx] = ret;
> + }
> 
> - of_node_put(np);
> - return of_get_ibm_chip_id(np);
> + return ret;
>  }
>  EXPORT_SYMBOL(cpu_to_chip_id);
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 5c7ce1d50631..50520fbea424 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1073,6 +1073,20 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   cpu_smallcore_mask(boot_cpuid));
>   }
> 
> + if (cpu_to_chip_id(boot_cpuid) != -1) {
> + int idx = num_possible_cpus() / threads_per_core;
> +
> + /*
> +  * All threads of a core belong to the same chip;
> +  * chip_id_lookup_table will have one entry per core.
> +  * Assumption: if boot_cpuid doesn't have a chip-id, then no
> +  * other CPU will have one either.
> +  */
> + chip_id_lookup_table = kcalloc(idx, sizeof(int), GFP_KERNEL);
> + if (chip_id_lookup_table)
> + memset(chip_id_lookup_table, -1, sizeof(int) * idx);
> + }
> +
>   if (smp_ops && smp_ops->probe)
>   smp_ops->probe();
>  }
> @@ -1468,8 +1482,8 @@ static void add_cpu_to_masks(int cpu)
>  {
>   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
>   int first_thread = cpu_first_thread_sibling(cpu);
> - int chip_id = cpu_to_chip_id(cpu);
>   cpumask_var_t mask;
> + int chip_id = -1;
>   bool ret;
>   int i;
> 
> @@ -1492,7 +1506,10 

Re: [PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-15 Thread Gautham R Shenoy
Hi Srikar,

On Thu, Apr 15, 2021 at 05:39:32PM +0530, Srikar Dronamraju wrote:
 [..snip..]



> @@ -1485,12 +1486,36 @@ static void add_cpu_to_masks(int cpu)
>   add_cpu_to_smallcore_masks(cpu);
> 
>   /* In CPU-hotplug path, hence use GFP_ATOMIC */
> - alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
> + ret = alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
>   update_mask_by_l2(cpu, );
> 
>   if (has_coregroup_support())
>   update_coregroup_mask(cpu, );
> 
> + if (chip_id == -1 || !ret) {
> + cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
> + goto out;
> + }
> +
> + if (shared_caches)
> + submask_fn = cpu_l2_cache_mask;
> +
> + /* Update core_mask with all the CPUs that are part of submask */
> + or_cpumasks_related(cpu, cpu, submask_fn, cpu_core_mask);
>

If coregroups exist, we can add the CPUs of the coregroup to the
cpu_core_mask, thereby reducing the scope of the for_each_cpu() search
below. This will still cut down the time on baremetal systems
supporting coregroups, as sketched below.
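
Something like the following fragment, slotted into add_cpu_to_masks()
before the for_each_cpu() loop (a sketch only, untested):

        /* Seed cpu_core_mask with the coregroup CPUs first, so that
         * the chip-id scan below has fewer CPUs left to visit. */
        if (has_coregroup_support())
                or_cpumasks_related(cpu, cpu, cpu_coregroup_mask, cpu_core_mask);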


> + /* Skip all CPUs already part of current CPU core mask */
> + cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));
> +
> + for_each_cpu(i, mask) {
> + if (chip_id == cpu_to_chip_id(i)) {
> + or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask);
> + cpumask_andnot(mask, mask, submask_fn(i));
> + } else {
> + cpumask_andnot(mask, mask, cpu_core_mask(i));
> + }
> + }
> +
> +out:
>   free_cpumask_var(mask);
>  }
> 
> -- 
> 2.25.1
> 


Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-14 Thread Gautham R Shenoy
On Mon, Apr 12, 2021 at 06:33:55PM +0200, Michal Suchánek wrote:
> On Mon, Apr 12, 2021 at 04:24:44PM +0100, Mel Gorman wrote:
> > On Mon, Apr 12, 2021 at 02:21:47PM +0200, Vincent Guittot wrote:
> > > > > Peter, Valentin, Vincent, Mel, etal
> > > > >
> > > > > On architectures where we have multiple levels of cache access 
> > > > > latencies
> > > > > within a DIE, (For example: one within the current LLC or SMT core 
> > > > > and the
> > > > > other at MC or Hemisphere, and finally across hemispheres), do you 
> > > > > have any
> > > > > suggestions on how we could handle the same in the core scheduler?
> > >
> > > I would say that SD_SHARE_PKG_RESOURCES is there for that and doesn't
> > > only rely on cache
> > >
> >
> > From topology.c
> >
> > SD_SHARE_PKG_RESOURCES - describes shared caches
> >
> > I'm guessing here because I am not familiar with power10 but the central
> > problem appears to be when to prefer selecting a CPU sharing L2 or L3
> > cache and the core assumes the last-level-cache is the only relevant one.
> 
> It does not seem to be the case according to original description:
> 
>  When the scheduler tries to wakeup a task, it chooses between the
>  waker-CPU and the wakee's previous-CPU. Suppose this choice is called
>  the "target", then in the target's LLC domain, the scheduler
>  
>  a) tries to find an idle core in the LLC. This helps exploit the
> This is the same as (b) Should this be SMT^^^ ?

On POWER10, without this patch, the LLC is at the SMT sched-domain.
The difference between a) and b) is that a) searches for an idle
core, while b) searches for an idle CPU.


> SMT folding that the wakee task can benefit from. If an idle
> core is found, the wakee is woken up on it.
>  
>  b) Failing to find an idle core, the scheduler tries to find an idle
> CPU in the LLC. This helps minimise the wakeup latency for the
> wakee since it gets to run on the CPU immediately.
>  
>  c) Failing this, it will wake it up on target CPU.
>  
>  Thus, with P9-sched topology, since the CACHE domain comprises of two
>  SMT4 cores, there is a decent chance that we get an idle core, failing
>  which there is a relatively higher probability of finding an idle CPU
>  among the 8 threads in the domain.
>  
>  However, in P10-sched topology, since the SMT domain is the LLC and it
>  contains only a single SMT4 core, the probability that we find that
>  core to be idle is less. Furthermore, since there are only 4 CPUs to
>  search for an idle CPU, there is lower probability that we can get an
>  idle CPU to wake up the task on.
> 
> >
> > For this patch, I wondered if setting SD_SHARE_PKG_RESOURCES would have
> > unintended consequences for load balancing because load within a die may
> > not be spread between SMT4 domains if SD_SHARE_PKG_RESOURCES was set at
> > the MC level.
> 
> Not spreading load between SMT4 domains within MC is exactly what setting LLC
> at MC level would address, wouldn't it?
>
> As in on P10 we have two relevant levels but the topology as is describes only
> one, and moving the LLC level lower gives two levels the scheduler looks at
> again. Or am I missing something?

This is my current understanding as well, since with this patch we
would then be able to move tasks quickly between the SMT4 cores,
perhaps at the expense of losing out on cache-affinity. Which is why
it would be good to verify this using a test/benchmark.


> 
> Thanks
> 
> Michal
> 

--
Thanks and Regards
gautham.


Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-14 Thread Gautham R Shenoy
Hello Mel,

On Mon, Apr 12, 2021 at 04:24:44PM +0100, Mel Gorman wrote:
> On Mon, Apr 12, 2021 at 02:21:47PM +0200, Vincent Guittot wrote:
> > > > Peter, Valentin, Vincent, Mel, etal
> > > >
> > > > On architectures where we have multiple levels of cache access latencies
> > > > within a DIE, (For example: one within the current LLC or SMT core and 
> > > > the
> > > > other at MC or Hemisphere, and finally across hemispheres), do you have 
> > > > any
> > > > suggestions on how we could handle the same in the core scheduler?
> > 
> > I would say that SD_SHARE_PKG_RESOURCES is there for that and doesn't
> > only rely on cache
> > 
> 
> >From topology.c
> 
>   SD_SHARE_PKG_RESOURCES - describes shared caches
>

Yes, I was aware that it describes shared caches, but this current
patch was the simplest way to achieve the effect, even though the cores
in the MC domain on POWER10 do not share a cache. However, it is
relatively faster to transfer data across the cores within the MC
domain compared to the cores outside the MC domain in the die.


> I'm guessing here because I am not familiar with power10 but the central
> problem appears to be when to prefer selecting a CPU sharing L2 or L3
> cache and the core assumes the last-level-cache is the only relevant one.
>

On POWER we have traditionally preferred to keep the LLC at the
sched-domain comprising of groups of CPUs that share the L2 (since L3
is a victim cache on POWER).

On POWER9, the L2 was shared by the threads of a pair of SMT4 cores,
while on POWER10, L2 is shared by threads of a single SMT4 core.

Thus, the current task wake-up logic would have a lower probability of
finding an idle core inside an LLC since it has only one core to
search in the LLC. This is why moving the LLC to the parent domain
(MC) consisting of a group of SMT4 cores among which snooping the
cache-data is faster is helpful for workloads that require the single
threaded performance.


> For this patch, I wondered if setting SD_SHARE_PKG_RESOURCES would have
> unintended consequences for load balancing because load within a die may
> not be spread between SMT4 domains if SD_SHARE_PKG_RESOURCES was set at
> the MC level.


Since we are adding SD_SHARE_PKG_RESOURCES to the parent of the
only sched-domain (which is an SMT4 domain) that currently has
this flag set, would it cause issues in spreading the load between the
SMT4 domains?

Are there any tests/benchmarks that can help bring this out? It could
be good to understand this.

> 
> > >
> > > Minimally I think it would be worth detecting when there are multiple
> > > LLCs per node and detecting that in generic code as a static branch. In
> > > select_idle_cpu, consider taking two passes -- first on the LLC domain
> > > and if no idle CPU is found then taking a second pass if the search depth
> > 
> > We have done a lot of changes to reduce and optimize the fast path and
> > I don't think re adding another layer  in the fast path makes sense as
> > you will end up unrolling the for_each_domain behind some
> > static_banches.
> > 
> 
> Searching the node would only happen if a) there was enough search depth
> left and b) there were no idle CPUs at the LLC level. As no new domain
> is added, it's not clear to me why for_each_domain would change.
> 
> But still, your comment reminded me that different architectures have
> different requirements
> 
> Power 10 appears to prefer CPU selection sharing L2 cache but desires
>   spillover to L3 when selecting an idle CPU.
>

Indeed, so on POWER10, the preference would be
1) idle core in the L2 domain.
2) idle core in the MC domain.
3) idle CPU  in the L2 domain
4) idle CPU  in the MC domain.

This patch is able to achieve this *implicitly* because of the way the
select_idle_cpu() and the select_idle_core() is currently coded, where
in the presence of idle cores in the MC level, the select_idle_core()
searches for the idle core starting with the core of the target-CPU.

If I understand your proposal correctly it would be to make this
explicit into a two level search where we first search in the LLC
domain, failing which, we carry on the search in the rest of the die
(assuming that the LLC is not in the die).


> X86 varies, it might want the Power10 approach for some families and prefer
>   L3 spilling over to a CPU on the same node in others.
> 
> S390 cares about something called books and drawers although I've no
>   idea what it means as such and whether it has any preferences on
>   search order.
> 
> ARM has similar requirements again according to "scheduler: expose the
>   topology of clusters and add cluster scheduler" and that one *does*
>   add another domain.
> 
> I had forgotten about the ARM patches but remembered that they were
> interesting because they potentially help the Zen situation but I didn't
> get the chance to review them before they fell off my radar again. About
> all I recall is that I thought the "cluster" terminology was vague.
> 
> The only 

Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-02 Thread Gautham R Shenoy
(Missed Cc'ing Peter in the original posting)

On Fri, Apr 02, 2021 at 11:07:54AM +0530, Gautham R. Shenoy wrote:
> From: "Gautham R. Shenoy" 
> 
> On POWER10 systems, the L2 cache is at the SMT4 small core level. The
> following commits ensure that L2 cache gets correctly discovered and
> the Last-Level-Cache domain (LLC) is set to the SMT sched-domain.
> 
> 790a166 powerpc/smp: Parse ibm,thread-groups with multiple properties
> 1fdc1d6 powerpc/smp: Rename cpu_l1_cache_map as thread_group_l1_cache_map
> fbd2b67 powerpc/smp: Rename init_thread_group_l1_cache_map() to make
>  it generic
> 538abe powerpc/smp: Add support detecting thread-groups sharing L2 cache
> 0be4763 powerpc/cacheinfo: Print correct cache-sibling map/list for L2 
> cache
> 
> However, with the LLC now on the SMT sched-domain, we are seeing some
> regressions in the performance of applications that require
> single-threaded performance. The reason for this is as follows:
> 
> Prior to the change (we call this P9-sched below), the sched-domain
> hierarchy was:
> 
> SMT (SMT4) --> CACHE (SMT8)[LLC] --> MC (Hemisphere) --> DIE
> 
> where the CACHE sched-domain is defined to be the Last Level Cache (LLC).
> 
> On the upstream kernel, with the aforementioned commits (P10-sched),
> the sched-domain hierarchy is:
> 
> SMT (SMT4)[LLC] --> MC (Hemisphere) --> DIE
> 
> with the SMT sched-domain as the LLC.
> 
> When the scheduler tries to wakeup a task, it chooses between the
> waker-CPU and the wakee's previous-CPU. Suppose this choice is called
> the "target", then in the target's LLC domain, the scheduler
> 
> a) tries to find an idle core in the LLC. This helps exploit the
>SMT folding that the wakee task can benefit from. If an idle
>core is found, the wakee is woken up on it.
> 
> b) Failing to find an idle core, the scheduler tries to find an idle
>CPU in the LLC. This helps minimise the wakeup latency for the
>wakee since it gets to run on the CPU immediately.
> 
> c) Failing this, it will wake it up on target CPU.
> 
> Thus, with the P9-sched topology, since the CACHE domain comprises two
> SMT4 cores, there is a decent chance that we get an idle core, failing
> which there is a relatively higher probability of finding an idle CPU
> among the 8 threads in the domain.
> 
> However, in P10-sched topology, since the SMT domain is the LLC and it
> contains only a single SMT4 core, the probability that we find that
> core to be idle is less. Furthermore, since there are only 4 CPUs to
> search for an idle CPU, there is lower probability that we can get an
> idle CPU to wake up the task on.
> 
> Thus applications which require single threaded performance will end
> up getting woken up on a potentially busy core, even though there are
> idle cores in the system.
> 
> To remedy this, this patch proposes that the LLC be moved to the MC
> level which is a group of cores in one half of the chip.
> 
>   SMT (SMT4) --> MC (Hemisphere)[LLC] --> DIE
> 
> While there is no cache being shared at this level, this is still the
> level where some amount of cache-snooping takes place and it is
> relatively faster to access the data from the caches of the cores
> within this domain. With this change, we no longer see regressions on
> P10 for applications which require single threaded performance.
> 
> The patch also improves the tail latencies on schbench and the
> usecs/op on "perf bench sched pipe"
> 
> On a 10 core P10 system with 80 CPUs,
> 
> schbench
> 
> (https://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git/)
> 
> Values: lower is better.
> 99th percentile is the tail latency.
> 
> 
> 99th percentile
> ~~~~~~~~~~~~~~~
> No. messenger
> threads     5.12-rc4    5.12-rc4
>             P10-sched   MC-LLC
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 1             70 us       85 us
> 2             81 us      101 us
> 3             92 us      107 us
> 4             96 us      110 us
> 5            103 us      123 us
> 6           3412 us      122 us
> 7           1490 us      136 us
> 8           6200 us     3572 us
> 
> 
> Hackbench
> 
> (perf bench sched pipe)
> values: lower the better
> 
> ~~~
> No. of
> parallel
> instances   5.12-rc4   5.12-rc4
> P10-sched  MC-LLC 
> ~~~
> 1   24.04 us/op18.72 us/op 
> 2   24.04 us/op18.65 us/op 
> 4   24.01 us/op18.76 us/op 
> 8   24.10 us/op 

[RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-02 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On POWER10 systems, the L2 cache is at the SMT4 small core level. The
following commits ensure that L2 cache gets correctly discovered and
the Last-Level-Cache domain (LLC) is set to the SMT sched-domain.

790a166 powerpc/smp: Parse ibm,thread-groups with multiple properties
1fdc1d6 powerpc/smp: Rename cpu_l1_cache_map as thread_group_l1_cache_map
fbd2b67 powerpc/smp: Rename init_thread_group_l1_cache_map() to make
 it generic
538abe powerpc/smp: Add support detecting thread-groups sharing L2 cache
0be4763 powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache

However, with the LLC now on the SMT sched-domain, we are seeing some
regressions in the performance of applications that require
single-threaded performance. The reason for this is as follows:

Prior to the change (we call this P9-sched below), the sched-domain
hierarchy was:

  SMT (SMT4) --> CACHE (SMT8)[LLC] --> MC (Hemisphere) --> DIE

where the CACHE sched-domain is defined to be the Last Level Cache (LLC).

On the upstream kernel, with the aforementioned commits (P10-sched),
the sched-domain hierarchy is:

  SMT (SMT4)[LLC] --> MC (Hemisphere) --> DIE

with the SMT sched-domain as the LLC.

When the scheduler tries to wakeup a task, it chooses between the
waker-CPU and the wakee's previous-CPU. Suppose this choice is called
the "target", then in the target's LLC domain, the scheduler

a) tries to find an idle core in the LLC. This helps exploit the
   SMT folding that the wakee task can benefit from. If an idle
   core is found, the wakee is woken up on it.

b) Failing to find an idle core, the scheduler tries to find an idle
   CPU in the LLC. This helps minimise the wakeup latency for the
   wakee since it gets to run on the CPU immediately.

c) Failing this, it will wake it up on target CPU.

Thus, with the P9-sched topology, since the CACHE domain comprises two
SMT4 cores, there is a decent chance that we get an idle core, failing
which there is a relatively higher probability of finding an idle CPU
among the 8 threads in the domain.

However, in P10-sched topology, since the SMT domain is the LLC and it
contains only a single SMT4 core, the probability that we find that
core to be idle is less. Furthermore, since there are only 4 CPUs to
search for an idle CPU, there is lower probability that we can get an
idle CPU to wake up the task on.

Thus applications which require single threaded performance will end
up getting woken up on a potentially busy core, even though there are
idle cores in the system.
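
In pseudo-C, the selection order described above is roughly the
following; the helpers are illustrative stand-ins for what
select_idle_sibling() and friends actually do:

extern int find_idle_core_in_llc(int target);   /* hypothetical */
extern int find_idle_cpu_in_llc(int target);    /* hypothetical */

static int pick_wakeup_cpu(int target)
{
        int cpu;

        /* a) Prefer a fully idle core in the target's LLC. */
        cpu = find_idle_core_in_llc(target);
        if (cpu >= 0)
                return cpu;

        /* b) Otherwise, any idle CPU in the LLC. */
        cpu = find_idle_cpu_in_llc(target);
        if (cpu >= 0)
                return cpu;

        /* c) Fall back to the target CPU itself. */
        return target;
}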

To remedy this, this patch proposes that the LLC be moved to the MC
level which is a group of cores in one half of the chip.

  SMT (SMT4) --> MC (Hemisphere)[LLC] --> DIE

While there is no cache being shared at this level, this is still the
level where some amount of cache-snooping takes place and it is
relatively faster to access the data from the caches of the cores
within this domain. With this change, we no longer see regressions on
P10 for applications which require single threaded performance.

The patch also improves the tail latencies on schbench and the
usecs/op on "perf bench sched pipe"

On a 10 core P10 system with 80 CPUs,

schbench

(https://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git/)

Values: lower is better.
99th percentile is the tail latency.


99th percentile
~~~~~~~~~~~~~~~
No. messenger
threads     5.12-rc4    5.12-rc4
            P10-sched   MC-LLC
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1             70 us       85 us
2             81 us      101 us
3             92 us      107 us
4             96 us      110 us
5            103 us      123 us
6           3412 us      122 us
7           1490 us      136 us
8           6200 us     3572 us


Hackbench

(perf bench sched pipe)
values: lower is better
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
No. of
parallel
instances   5.12-rc4      5.12-rc4
            P10-sched     MC-LLC
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1           24.04 us/op   18.72 us/op
2           24.04 us/op   18.65 us/op
4           24.01 us/op   18.76 us/op
8           24.10 us/op   19.11 us/op
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5a4d59a..c75dbd4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -976,6 +976,13 @@ static bool has_coregroup_support(void)
return coregroup_enabled;
 }
 
+static int powerpc_mc_flags(void)
+{
+   if (has_coregroup_support())
+   return SD_SHARE_PKG_RESOURCES;
+   return 0;
+}
+
 static const struct cpumask *cpu_mc_mask(int cpu)
 {
return cpu_coregroup_mask(cpu);
@@ -986,7 +993

Re: [PATCH v7 07/42] powerpc/fsl_booke/32: CacheLockingException remove args

2021-02-08 Thread Gautham R Shenoy
On Sat, Jan 30, 2021 at 11:08:17PM +1000, Nicholas Piggin wrote:
> Like other interrupt handler conversions, switch to getting registers
> from the pt_regs argument.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/head_fsl_booke.S | 6 +++---
>  arch/powerpc/kernel/traps.c  | 5 +++--
>  2 files changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/head_fsl_booke.S 
> b/arch/powerpc/kernel/head_fsl_booke.S
> index fdd4d274c245..0d4d9a6fcca1 100644
> --- a/arch/powerpc/kernel/head_fsl_booke.S
> +++ b/arch/powerpc/kernel/head_fsl_booke.S
> @@ -364,12 +364,12 @@ interrupt_base:
>   /* Data Storage Interrupt */
>   START_EXCEPTION(DataStorage)
>   NORMAL_EXCEPTION_PROLOG(DATA_STORAGE)
> - mfspr   r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */
> + mfspr   r5,SPRN_ESR /* Grab the ESR, save it3 */
 ^^
Sorry for the nitpick.. Should be "/* Grab the ESR, save it */"


>   stw r5,_ESR(r11)
> - mfspr   r4,SPRN_DEAR/* Grab the DEAR, save it, pass arg2 */
> + mfspr   r4,SPRN_DEAR/* Grab the DEAR, save it */
> + stw r4, _DEAR(r11)
>   andis.  r10,r5,(ESR_ILK|ESR_DLK)@h
>   bne 1f
> - stw r4, _DEAR(r11)
>   EXC_XFER_LITE(0x0300, handle_page_fault)
>  1:
>   addir3,r1,STACK_FRAME_OVERHEAD
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 3ec7b443fe6b..1c77b1a8f7c9 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -2062,9 +2062,10 @@ void altivec_assist_exception(struct pt_regs *regs)
>  #endif /* CONFIG_ALTIVEC */
> 
>  #ifdef CONFIG_FSL_BOOKE
> -void CacheLockingException(struct pt_regs *regs, unsigned long address,
> -unsigned long error_code)
> +void CacheLockingException(struct pt_regs *regs)
>  {
> + unsigned long error_code = regs->dsisr;
> +
>   /* We treat cache locking instructions from the user
>* as priv ops, in the future we could try to do
>* something smarter
> -- 
> 2.23.0
> 


[PATCH 0/2] powerpc/cacheinfo: Add "ibm,thread-groups" awareness

2021-01-18 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Hi,

Currently the cacheinfo code on powerpc indexes the "cache" objects
(modelling the L1/L2/L3 caches) where the key is device-tree node
corresponding to that cache. On some of the POWER server platforms
thread-groups within the core share different sets of caches (Eg: On
SMT8 POWER9 systems, threads 0,2,4,6 of a core share L1 cache and
threads 1,3,5,7 of the same core share another L1 cache). On such
platforms, there is a single device-tree node corresponding to that
cache and the cache-configuration within the threads of the core is
indicated via the "ibm,thread-groups" device-tree property.

Since the current code is not aware of the "ibm,thread-groups"
property, on the aforementioned systems, the cacheinfo code still
treats all the threads in the core as sharing the cache because of
the single device-tree node (in the earlier example, the cacheinfo
code would say that CPUs 0-7 share the L1 cache).

In this patch series, we make the powerpc cacheinfo code aware of the
"ibm,thread-groups" property. We indexe the "cache" objects by the
key-pair (device-tree node, thread-group id). For any CPUX, for a
given level of cache, the thread-group id is defined to be the first
CPU in the "ibm,thread-groups" cache-group containing CPUX. For levels
of cache which are not represented in "ibm,thread-groups" property,
the thread-group id is -1.
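
(As an illustration of this definition, and not the patch code itself,
the group id of a CPU at a given cache level could be computed as:)

#include <linux/cpumask.h>

/* tg_mask: the "ibm,thread-groups" siblings of the CPU at this cache
 * level; empty when the level is not described by the property. */
static int cache_group_id(const struct cpumask *tg_mask)
{
        if (cpumask_empty(tg_mask))
                return -1;              /* level not covered by the property */

        return cpumask_first(tg_mask);  /* first CPU of the group */
}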

We can now remove the helper functions get_shared_cpu_map() and
index_dir_to_cpu() since cache->shared_cpu_map contains the
correct state of the thread-siblings sharing the cache.

This has been tested on a SMT8 POWER9 system where L1 cache is split
between groups of threads in the core and on an SMT8 POWER10 system
where L1 and L2 caches are split between groups of threads in a core.

With this patch series, on POWER10 SMT8 system, we see the following
reported via sysfs:

$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-15

$ ppc64_cpu --smt=4
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-11
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-11

$ ppc64_cpu --smt=2
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-9
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-9

$ ppc64_cpu --smt=1
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8

Gautham R. Shenoy (2):
  powerpc/cacheinfo: Lookup cache by dt node and thread-group id
  powerpc/cacheinfo: Remove the redundant get_shared_cpu_map()

 arch/powerpc/include/asm/smp.h  |   3 +
 arch/powerpc/kernel/cacheinfo.c | 121 
 2 files changed, 62 insertions(+), 62 deletions(-)

-- 
1.9.4



[PATCH 2/2] powerpc/cacheinfo: Remove the redundant get_shared_cpu_map()

2021-01-18 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

The helper function get_shared_cpu_map() was added in

'commit 500fe5f550ec ("powerpc/cacheinfo: Report the correct
shared_cpu_map on big-cores")'

and subsequently expanded upon in

'commit 0be47634db0b ("powerpc/cacheinfo: Print correct cache-sibling
map/list for L2 cache")'

in order to help report the correct groups of threads sharing these caches
on big-core systems where groups of threads within a core can share
different sets of caches.

Now that powerpc/cacheinfo is aware of the "ibm,thread-groups"
property, cache->shared_cpu_map contains the correct set of
thread-siblings sharing the cache. Hence we no longer need the
function get_shared_cpu_map(), and this patch removes it. We also
remove the helper function index_dir_to_cpu() which was only called
by get_shared_cpu_map().

With these functions removed, we can still see the correct
cache-sibling map/list for L1 and L2 caches on systems with L1 and L2
caches distributed among groups of threads in a core.

With this patch, on a SMT8 POWER10 system where the L1 and L2 caches
are split between the two groups of threads in a core, for CPUs 8,9,
the L1-Data, L1-Instruction, L2, L3 cache CPU sibling list is as
follows:

$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-15

$ ppc64_cpu --smt=4
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-11
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-11

$ ppc64_cpu --smt=2
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-9
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-9

$ ppc64_cpu --smt=1
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/cacheinfo.c | 41 +
 1 file changed, 1 insertion(+), 40 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 5a6925d..20d9169 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -675,45 +675,6 @@ static ssize_t level_show(struct kobject *k, struct 
kobj_attribute *attr, char *
 static struct kobj_attribute cache_level_attr =
__ATTR(level, 0444, level_show, NULL);
 
-static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
-{
-   struct kobject *index_dir_kobj = >kobj;
-   struct kobject *cache_dir_kobj = index_dir_kobj->parent;
-   struct kobject *cpu_dev_kobj = cache_dir_kobj->parent;
-   struct device *dev = kobj_to_dev(cpu_dev_kobj);
-
-   return dev->id;
-}
-
-/*
- * On big-core systems, each core has two groups of CPUs each of which
- * has its own L1-cache. The thread-siblings which share l1-cache with
- * @cpu can be obtained via cpu_smallcore_mask().
- *
- * On some big-core systems, the L2 cache is shared only between some
- * groups of siblings. This is already parsed and encoded in
- * cpu_l2_cache_mask().
- *
- * TODO: cache_lookup_or_instantiate() needs to be made aware of the
- *   "ibm,thread-groups" property so that cache->shared_cpu_map
- *   reflects the correct siblings on platforms that have this
- *   device-tree property. This helper function is only a stop-gap
- *   solution so that we report the

[PATCH 1/2] powerpc/cacheinfo: Lookup cache by dt node and thread-group id

2021-01-18 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently the cacheinfo code on powerpc indexes the "cache" objects
(modelling the L1/L2/L3 caches) where the key is the device-tree node
corresponding to that cache. On some of the POWER server platforms
thread-groups within the core share different sets of caches (Eg: On
SMT8 POWER9 systems, threads 0,2,4,6 of a core share L1 cache and
threads 1,3,5,7 of the same core share another L1 cache). On such
platforms, there is a single device-tree node corresponding to that
cache and the cache-configuration within the threads of the core is
indicated via the "ibm,thread-groups" device-tree property.

Since the current code is not aware of the "ibm,thread-groups"
property, on the aforementioned systems, the cacheinfo code still
treats all the threads in the core as sharing the cache because of
the single device-tree node (in the earlier example, the cacheinfo
code would say that CPUs 0-7 share the L1 cache).

In this patch, we make the powerpc cacheinfo code aware of the
"ibm,thread-groups" property. We indexe the "cache" objects by the
key-pair (device-tree node, thread-group id). For any CPUX, for a
given level of cache, the thread-group id is defined to be the first
CPU in the "ibm,thread-groups" cache-group containing CPUX. For levels
of cache which are not represented in "ibm,thread-groups" property,
the thread-group id is -1.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/include/asm/smp.h  |  3 ++
 arch/powerpc/kernel/cacheinfo.c | 80 +
 2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index c4e2d53..39de24b 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -32,6 +32,9 @@
 
 extern int cpu_to_chip_id(int cpu);
 
+DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
+DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+
 #ifdef CONFIG_SMP
 
 struct smp_ops_t {
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 6f903e9a..5a6925d 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -120,6 +120,7 @@ struct cache {
struct cpumask shared_cpu_map; /* online CPUs using this cache */
int type;  /* split cache disambiguation */
int level; /* level not explicit in device tree */
+   int group_id;  /* id of the group of threads that share 
this cache */
struct list_head list; /* global list of cache objects */
struct cache *next_local;  /* next cache of >= level */
 };
@@ -142,22 +143,24 @@ static const char *cache_type_string(const struct cache 
*cache)
 }
 
 static void cache_init(struct cache *cache, int type, int level,
-  struct device_node *ofnode)
+  struct device_node *ofnode, int group_id)
 {
cache->type = type;
cache->level = level;
cache->ofnode = of_node_get(ofnode);
+   cache->group_id = group_id;
INIT_LIST_HEAD(>list);
list_add(>list, _list);
 }
 
-static struct cache *new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level,
+  struct device_node *ofnode, int group_id)
 {
struct cache *cache;
 
cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (cache)
-   cache_init(cache, type, level, ofnode);
+   cache_init(cache, type, level, ofnode, group_id);
 
return cache;
 }
@@ -309,20 +312,24 @@ static struct cache *cache_find_first_sibling(struct 
cache *cache)
return cache;
 
list_for_each_entry(iter, _list, list)
-   if (iter->ofnode == cache->ofnode && iter->next_local == cache)
+   if (iter->ofnode == cache->ofnode &&
+   iter->group_id == cache->group_id &&
+   iter->next_local == cache)
return iter;
 
return cache;
 }
 
-/* return the first cache on a local list matching node */
-static struct cache *cache_lookup_by_node(const struct device_node *node)
+/* return the first cache on a local list matching node and thread-group id */
+static struct cache *cache_lookup_by_node_group(const struct device_node *node,
+   int group_id)
 {
struct cache *cache = NULL;
struct cache *iter;
 
list_for_each_entry(iter, _list, list) {
-   if (iter->ofnode != node)
+   if (iter->ofnode != node ||
+   iter->group_id != group_id)
continue;
cache = cache_find_first_sibling(iter);
break;
@@ -352,1

[PATCH v3 0/5] Extend Parsing "ibm, thread-groups" for Shared-L2 information

2020-12-10 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Hi,

This is the v3 of the patchset to extend parsing of the
"ibm,thread-groups" property to discover the Shared-L2 cache
information.

The previous versions can be found here:

v2 : 
https://lore.kernel.org/linuxppc-dev/1607533700-5546-1-git-send-email-...@linux.vnet.ibm.com/T/#m043ea15d3832658527fca94765202b9cbefd330d

v1 : 
https://lore.kernel.org/linuxppc-dev/1607057327-29822-1-git-send-email-...@linux.vnet.ibm.com/T/#m0fabffa1ea1a2807b362f25c849bb19415216520


Changes from v2-->v3:
 * Fixed the build errors reported by the Kernel Test Robot for Patches 4 and 5.

Changes from v1-->v2:
Incorporate the review comments from Srikar and
fix a build error on !PPC64 configs reported by the kernel bot.

 * Split Patch 1 into three patches
   * First patch ensures that parse_thread_groups() is made generic to
     support more than one property.
   * Second patch renames cpu_l1_cache_map as
 thread_group_l1_cache_map for consistency. No functional impact.
   * The third patch makes init_thread_group_l1_cache_map()
 generic. No functional impact.

* Patch 2 (Now patch 4): Incorporates the review comments from Srikar
  simplifying the changes to update_mask_by_l2()

* Patch 3 (Now patch 5): Fixes build errors for 32-bit configs
  reported by the kernel build bot.

Description of the Patchset
===
The "ibm,thread-groups" device-tree property is an array that is used
to indicate if groups of threads within a core share certain
properties. It provides details of which property is being shared by
which groups of threads. This array can encode information about
multiple properties being shared by different thread-groups within the
core.

Example: Suppose,
"ibm,thread-groups" = [1,2,4,8,10,12,14,9,11,13,15,2,2,4,8,10,12,14,9,11,13,15]

This can be decomposed into two consecutive arrays:

a) [1,2,4,8,10,12,14,9,11,13,15]
b) [2,2,4,8,10,12,14,9,11,13,15]

where in,

a) provides information of Property "1" being shared by "2" groups,
   each with "4" threads each. The "ibm,ppc-interrupt-server#s" of the
   first group is {8,10,12,14} and the "ibm,ppc-interrupt-server#s" of
   the second group is {9,11,13,15}. Property "1" is indicative of
   the threads in the group sharing the L1 cache, translation cache
   and instruction data flow.

b) provides information of Property "2" being shared by "2" groups,
   each group with "4" threads. The "ibm,ppc-interrupt-server#s" of
   the first group is {8,10,12,14} and the
   "ibm,ppc-interrupt-server#s" of the second group is
   {9,11,13,15}. Property "2" indicates that the threads in each group
   share the L2-cache.
   
The existing code assumes that the "ibm,thread-groups" encodes
information about only one property. Hence even on platforms which
encode information about multiple properties being shared by the
corresponding groups of threads, the current code will only pick the
first one. (In the above example, it will only consider
[1,2,4,8,10,12,14,9,11,13,15] but not [2,2,4,8,10,12,14,9,11,13,15]).
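
To make the layout concrete, here is a stand-alone sketch of decoding
the flattened array into per-property groups (illustrative only, not
the kernel's parse_thread_groups()):

struct tg_property {
        unsigned int property;          /* e.g. 1 (L1) or 2 (L2) */
        unsigned int nr_groups;
        unsigned int threads_per_group;
        const unsigned int *cpus;       /* nr_groups * threads_per_group ids */
};

static int decode_thread_groups(const unsigned int *prop, int len,
                                struct tg_property *out, int max_out)
{
        int i = 0, n = 0;

        while (i + 3 <= len && n < max_out) {
                int need = prop[i + 1] * prop[i + 2];

                if (i + 3 + need > len)
                        break;  /* truncated/malformed property */

                out[n].property = prop[i];
                out[n].nr_groups = prop[i + 1];
                out[n].threads_per_group = prop[i + 2];
                out[n].cpus = &prop[i + 3];
                i += 3 + need;
                n++;
        }

        return n;       /* number of properties decoded */
}

Running this over the example above yields two entries: (property 1,
2 groups of 4) with CPU lists {8,10,12,14} and {9,11,13,15}, and
(property 2, 2 groups of 4) with the same lists.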

Furthermore, currently on platforms where groups of threads share L2
cache, we incorrectly create an extra CACHE level sched-domain that
maps to all the threads of the core.

For example, if "ibm,thread-groups" is
00000001 00000002 00000004 00000000
00000002 00000004 00000006 00000001
00000003 00000005 00000007 00000002
00000002 00000004 00000000 00000002
00000004 00000006 00000001 00000003
00000005 00000007

then, the sub-array
[00000002 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007]
indicates that L2 (Property "2") is shared only between the threads of a single
group. There are "2" groups of threads where each group contains "4"
threads each. The groups being {0,2,4,6} and {1,3,5,7}.

However, the sched-domain hierarchy for CPUs 0,1 is
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

where the CACHE domain reports that L2 is shared across the entire
core which is incorrect on such platforms.

This patchset remedies these issues by extending the parsing support
for "ibm,thread-groups" to discover information about multiple
properties being shared by the corresponding groups of threads. In
particular we can now detect if the groups of threads within a core
share the L2-cache. On such platf

[PATCH v3 2/5] powerpc/smp: Rename cpu_l1_cache_map as thread_group_l1_cache_map

2020-12-10 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On platforms which have the "ibm,thread-groups" property, the per-cpu
variable cpu_l1_cache_map keeps track of which group of threads
within the same core share the L1 cache, instruction and data flow.

This patch renames the variable to "thread_group_l1_cache_map" to make
it consistent with a subsequent patch which will introduce
thread_group_l2_cache_map.

This patch introduces no functional change.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 88d88ad..f3290d5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -116,10 +116,10 @@ struct thread_groups_list {
 
 static struct thread_groups_list tgl[NR_CPUS] __initdata;
 /*
- * On big-cores system, cpu_l1_cache_map for each CPU corresponds to
+ * On big-cores system, thread_group_l1_cache_map for each CPU corresponds to
  * the set its siblings that share the L1-cache.
  */
-DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map);
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
 
 /* SMP operations for this machine */
 struct smp_ops_t *smp_ops;
@@ -866,7 +866,7 @@ static struct thread_groups *__init get_thread_groups(int 
cpu,
return tg;
 }
 
-static int init_cpu_l1_cache_map(int cpu)
+static int init_thread_group_l1_cache_map(int cpu)
 
 {
int first_thread = cpu_first_thread_sibling(cpu);
@@ -885,7 +885,7 @@ static int init_cpu_l1_cache_map(int cpu)
return -ENODATA;
}
 
-   zalloc_cpumask_var_node(_cpu(cpu_l1_cache_map, cpu),
+   zalloc_cpumask_var_node(_cpu(thread_group_l1_cache_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++) {
@@ -897,7 +897,7 @@ static int init_cpu_l1_cache_map(int cpu)
}
 
if (i_group_start == cpu_group_start)
-   cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu));
+   cpumask_set_cpu(i, per_cpu(thread_group_l1_cache_map, 
cpu));
}
 
return 0;
@@ -976,7 +976,7 @@ static int init_big_cores(void)
int cpu;
 
for_each_possible_cpu(cpu) {
-   int err = init_cpu_l1_cache_map(cpu);
+   int err = init_thread_group_l1_cache_map(cpu);
 
if (err)
return err;
@@ -1372,7 +1372,7 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
 
cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu));
 
-   for_each_cpu(i, per_cpu(cpu_l1_cache_map, cpu)) {
+   for_each_cpu(i, per_cpu(thread_group_l1_cache_map, cpu)) {
if (cpu_online(i))
set_cpus_related(i, cpu, cpu_smallcore_mask);
}
-- 
1.9.4



[PATCH v3 3/5] powerpc/smp: Rename init_thread_group_l1_cache_map() to make it generic

2020-12-10 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

init_thread_group_l1_cache_map() initializes the per-cpu cpumask
thread_group_l1_cache_map with the core-siblings which share L1 cache
with the CPU. Make this function generic to the cache-property (L1 or
L2) and update a suitable mask. This is a preparatory patch for the
next patch where we will introduce discovery of thread-groups that
share L2-cache.

No functional change.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index f3290d5..9078b5b5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -866,15 +866,18 @@ static struct thread_groups *__init get_thread_groups(int cpu,
return tg;
 }
 
-static int init_thread_group_l1_cache_map(int cpu)
+static int __init init_thread_group_cache_map(int cpu, int cache_property)
 
 {
int first_thread = cpu_first_thread_sibling(cpu);
int i, cpu_group_start = -1, err = 0;
struct thread_groups *tg = NULL;
+   cpumask_var_t *mask;
 
-   tg = get_thread_groups(cpu, THREAD_GROUP_SHARE_L1,
-  &err);
+   if (cache_property != THREAD_GROUP_SHARE_L1)
+   return -EINVAL;
+
+   tg = get_thread_groups(cpu, cache_property, &err);
if (!tg)
return err;
 
@@ -885,8 +888,8 @@ static int init_thread_group_l1_cache_map(int cpu)
return -ENODATA;
}
 
-   zalloc_cpumask_var_node(&per_cpu(thread_group_l1_cache_map, cpu),
-   GFP_KERNEL, cpu_to_node(cpu));
+   mask = &per_cpu(thread_group_l1_cache_map, cpu);
+   zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++) {
int i_group_start = get_cpu_thread_group_start(i, tg);
@@ -897,7 +900,7 @@ static int init_thread_group_l1_cache_map(int cpu)
}
 
if (i_group_start == cpu_group_start)
-   cpumask_set_cpu(i, per_cpu(thread_group_l1_cache_map, cpu));
+   cpumask_set_cpu(i, *mask);
}
 
return 0;
@@ -976,7 +979,7 @@ static int init_big_cores(void)
int cpu;
 
for_each_possible_cpu(cpu) {
-   int err = init_thread_group_l1_cache_map(cpu);
+   int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L1);
 
if (err)
return err;
-- 
1.9.4



[PATCH v3 4/5] powerpc/smp: Add support detecting thread-groups sharing L2 cache

2020-12-10 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On POWER systems, groups of threads within a core sharing the L2-cache
can be indicated by the "ibm,thread-groups" property array with the
identifier "2".

This patch adds support for detecting this and, when present,
populates the cpu_l2_cache_mask of every CPU with the core-siblings
which share the L2-cache with that CPU, as specified by the
"ibm,thread-groups" property array.

On a platform with the following "ibm,thread-groups" configuration
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

Without this patch, the sched-domain hierarchy for CPUs 0,1 would be
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

The CACHE domain at 0-7 is incorrect since the ibm,thread-groups
sub-array
[00000002 00000002 00000004
 00000000 00000002 00000004 00000006
 00000001 00000003 00000005 00000007]
indicates that L2 (Property "2") is shared only between the threads of a single
group. There are "2" groups, each containing "4" threads. The groups
are {0,2,4,6} and {1,3,5,7}.

With this patch, the sched-domain hierarchy for CPUs 0,1 would be
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

The CACHE domain with span=0,2,4,6 for CPU 0 (span=1,3,5,7 for CPU 1
resp.) gets degenerated into the SMT domain. Furthermore, the
last-level-cache domain gets correctly set to the SMT sched-domain.
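
As a quick way to verify the resulting hierarchy on a running kernel,
the per-CPU domain names can be read back from procfs (an illustrative
user-space snippet; it assumes CONFIG_SCHED_DEBUG and the
/proc/sys/kernel/sched_domain layout of kernels from this era):

#include <stdio.h>

int main(void)
{
	char path[128], name[64];
	int d;

	for (d = 0; d < 8; d++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/proc/sys/kernel/sched_domain/cpu0/domain%d/name", d);
		f = fopen(path, "r");
		if (!f)
			break;	/* no more domain levels */
		if (fgets(name, sizeof(name), f))
			printf("domain-%d: %s", d, name);
		fclose(f);
	}
	return 0;
}

With this patch applied, the listing for CPU 0 should go straight from
SMT to MC, with no intermediate CACHE level.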

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/include/asm/smp.h |  2 ++
 arch/powerpc/kernel/smp.c  | 58 ++
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index b2035b2..035459c 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -134,6 +134,7 @@ static inline struct cpumask *cpu_smallcore_mask(int cpu)
 extern int cpu_to_core_id(int cpu);
 
 extern bool has_big_cores;
+extern bool thread_group_shares_l2;
 
 #define cpu_smt_mask cpu_smt_mask
 #ifdef CONFIG_SCHED_SMT
@@ -187,6 +188,7 @@ static inline const struct cpumask *cpu_smt_mask(int cpu)
 /* for UP */
#define hard_smp_processor_id()	get_hard_smp_processor_id(0)
 #define smp_setup_cpu_maps()
+#define thread_group_shares_l2  0
 static inline void inhibit_secondary_onlining(void) {}
 static inline void uninhibit_secondary_onlining(void) {}
 static inline const struct cpumask *cpu_sibling_mask(int cpu)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9078b5b5..2b9b1bb 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -76,6 +76,7 @@
 struct task_struct *secondary_current;
 bool has_big_cores;
 bool coregroup_enabled;
+bool thread_group_shares_l2;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -99,6 +100,7 @@ enum {
 
 #define MAX_THREAD_LIST_SIZE   8
 #define THREAD_GROUP_SHARE_L1   1
+#define THREAD_GROUP_SHARE_L2   2
 struct thread_groups {
unsigned int property;
unsigned int nr_groups;
@@ -107,7 +109,7 @@ struct thread_groups {
 };
 
 /* Maximum number of properties that groups of threads within a core can share */
-#define MAX_THREAD_GROUP_PROPERTIES 1
+#define MAX_THREAD_GROUP_PROPERTIES 2
 
 struct thread_groups_list {
unsigned int nr_properties;
@@ -121,6 +123,13 @@ struct thread_groups_list {
  */
 DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
 
+/*
+ * On some big-cores system, thread_group_l2_cache_map for each CPU
+ * corresponds to the set its siblings within the core that share the
+ * L2-cache.
+ */
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+
 /* SMP operations for this machine */
 struct smp_ops_t *smp_ops;
 
@@ -718,7 +727,9 @@ static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
  *
  * ibm,thread-groups[i + 0] tells us the property based on which the
  * threads are being grouped together. If this value is 1, it implies
- * that the threads in the same group share L1, trans

[PATCH v3 5/5] powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache

2020-12-10 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On POWER platforms where only some groups of threads within a core
share the L2-cache (indicated by the ibm,thread-groups device-tree
property), we currently print the incorrect shared_cpu_map/list for
L2-cache in the sysfs.

This patch reports the correct shared_cpu_map/list on such platforms.

Example:
On a platform with "ibm,thread-groups" set to
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

This indicates that threads {0,2,4,6} in the core share one L2-cache
and threads {1,3,5,7} in the core share a different L2-cache.

However, without the patch, the shared_cpu_map/list for L2 for CPUs 0,
1 is reported in the sysfs as follows:

/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map:00,00ff

/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_map:00,00ff

With the patch, the shared_cpu_map/list for L2 cache for CPUs 0, 1 is
correctly reported as follows:

/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,2,4,6
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map:00,0055

/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:1,3,5,7
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_map:00,00aa

This patch also defines cpu_l2_cache_mask() for the !CONFIG_SMP case.
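
To see why "00,0055" corresponds to "0,2,4,6", the mask word can be
expanded bit by bit (a stand-alone illustration, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int map = 0x0055;	/* low word of "00,0055" */
	int cpu;

	printf("cpus:");
	for (cpu = 0; cpu < 16; cpu++)
		if (map & (1u << cpu))
			printf(" %d", cpu);
	printf("\n");			/* -> cpus: 0 2 4 6 */
	return 0;
}
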
Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/include/asm/smp.h  |  4 
 arch/powerpc/kernel/cacheinfo.c | 30 --
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 035459c..c4e2d53 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -201,6 +201,10 @@ static inline const struct cpumask *cpu_smallcore_mask(int 
cpu)
return cpumask_of(cpu);
 }
 
+static inline const struct cpumask *cpu_l2_cache_mask(int cpu)
+{
+   return cpumask_of(cpu);
+}
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 65ab9fc..6f903e9a 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -655,11 +655,27 @@ static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
  * On big-core systems, each core has two groups of CPUs each of which
  * has its own L1-cache. The thread-siblings which share l1-cache with
  * @cpu can be obtained via cpu_smallcore_mask().
+ *
+ * On some big-core systems, the L2 cache is shared only between some
+ * groups of siblings. This is already parsed and encoded in
+ * cpu_l2_cache_mask().
+ *
+ * TODO: cache_lookup_or_instantiate() needs to be made aware of the
+ *   "ibm,thread-groups" property so that cache->shared_cpu_map
+ *   reflects the correct siblings on platforms that have this
+ *   device-tree property. This helper function is only a stop-gap
+ *   solution so that we report the correct siblings to the
+ *   userspace via sysfs.
  */
-static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache)
+static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache)
 {
-   if (cache->level == 1)
-   return cpu_smallcore_mask(cpu);
+   if (has_big_cores) {
+   int cpu = index_dir_to_cpu(index);
+   if (cache->level == 1)
+   return cpu_smallcore_mask(cpu);
+   if (cache->level == 2 && thread_group_shares_l2)
+   return cpu_l2_cache_mask(cpu);
+   }
 
	return &cache->shared_cpu_map;
 }
@@ -670,17 +686,11 @@ static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *
struct cache_index_dir *index;
struct cache *cache;
const struct cpumask *mask;
-   int cpu;
 
index = kobj_to_cache_index_dir(k);
cache = index->cache;
 
-   if (has_big_cores) {
-   cpu = index_dir_to_cpu(index);
-   mask = get_big_core_shared_cpu_map(cpu, cache);
-   } else {
-   mask  = &cache->shared_cpu_map;
-   }
+   mask = get_shared_cpu_map(index, cache);
 
return cpumap_print_to_pagebuf(list, buf, mask);
 }
-- 
1.9.4



[PATCH v3 1/5] powerpc/smp: Parse ibm,thread-groups with multiple properties

2020-12-10 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

The "ibm,thread-groups" device-tree property is an array that is used
to indicate if groups of threads within a core share certain
properties. It provides details of which property is being shared by
which groups of threads. This array can encode information about
multiple properties being shared by different thread-groups within the
core.

Example: Suppose,
"ibm,thread-groups" = [1,2,4,8,10,12,14,9,11,13,15,2,2,4,8,10,12,14,9,11,13,15]

This can be decomposed into two consecutive arrays:

a) [1,2,4,8,10,12,14,9,11,13,15]
b) [2,2,4,8,10,12,14,9,11,13,15]

where in,

a) provides information about Property "1" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of the
   first group is {8,10,12,14} and the "ibm,ppc-interrupt-server#s" of
   the second group is {9,11,13,15}. Property "1" is indicative of
   the threads in the group sharing L1 cache, translation cache and
   Instruction Data flow.

b) provides information about Property "2" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of
   the first group is {8,10,12,14} and the
   "ibm,ppc-interrupt-server#s" of the second group is
   {9,11,13,15}. Property "2" indicates that the threads in each group
   share the L2-cache.

The existing code assumes that the "ibm,thread-groups" encodes
information about only one property. Hence even on platforms which
encode information about multiple properties being shared by the
corresponding groups of threads, the current code will only pick the
first one. (In the above example, it will only consider
[1,2,4,8,10,12,14,9,11,13,15] but not [2,2,4,8,10,12,14,9,11,13,15]).

This patch extends the parsing support on platforms which encode
information about multiple properties being shared by the
corresponding groups of threads.
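
The stepping logic of the extended parser can be sketched as follows
(a simplified, stand-alone rendition of the loop this patch adds to
parse_thread_groups(); user-space C for illustration only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t a[] = { 1, 2, 4, 8, 10, 12, 14, 9, 11, 13, 15,
			 2, 2, 4, 8, 10, 12, 14, 9, 11, 13, 15 };
	size_t count = sizeof(a) / sizeof(a[0]), i = 0;

	/* Each sub-array is [property, nr_groups, threads_per_group,
	 * thread list]; advance over the header plus the thread list.
	 */
	while (i + 3 <= count) {
		uint32_t total = a[i + 1] * a[i + 2];

		printf("property %u: %u groups of %u threads\n",
		       a[i], a[i + 1], a[i + 2]);
		i += 3 + total;
	}
	return 0;
}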

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 174 ++
 1 file changed, 113 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857c..88d88ad 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -106,6 +106,15 @@ struct thread_groups {
unsigned int thread_list[MAX_THREAD_LIST_SIZE];
 };
 
+/* Maximum number of properties that groups of threads within a core can share */
+#define MAX_THREAD_GROUP_PROPERTIES 1
+
+struct thread_groups_list {
+   unsigned int nr_properties;
+   struct thread_groups property_tgs[MAX_THREAD_GROUP_PROPERTIES];
+};
+
+static struct thread_groups_list tgl[NR_CPUS] __initdata;
 /*
  * On big-cores system, cpu_l1_cache_map for each CPU corresponds to
  * the set its siblings that share the L1-cache.
@@ -695,81 +704,98 @@ static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
 /*
  * parse_thread_groups: Parses the "ibm,thread-groups" device tree
  *  property for the CPU device node @dn and stores
- *  the parsed output in the thread_groups
- *  structure @tg if the ibm,thread-groups[0]
- *  matches @property.
+ *  the parsed output in the thread_groups_list
+ *  structure @tglp.
  *
  * @dn: The device node of the CPU device.
- * @tg: Pointer to a thread group structure into which the parsed
+ * @tglp: Pointer to a thread group list structure into which the parsed
  *  output of "ibm,thread-groups" is stored.
- * @property: The property of the thread-group that the caller is
- *interested in.
  *
  * ibm,thread-groups[0..N-1] array defines which group of threads in
  * the CPU-device node can be grouped together based on the property.
  *
- * ibm,thread-groups[0] tells us the property based on which the
+ * This array can represent thread groupings for multiple properties.
+ *
+ * ibm,thread-groups[i + 0] tells us the property based on which the
  * threads are being grouped together. If this value is 1, it implies
  * that the threads in the same group share L1, translation cache.
  *
- * ibm,thread-groups[1] tells us how many such thread groups exist.
+ * ibm,thread-groups[i+1] tells us how many such thread groups exist for the
+ * property ibm,thread-groups[i]
  *
- * ibm,thread-groups[2] tells us the number of threads in each such
+ * ibm,thread-groups[i+2] tells us the number of threads in each such
  * group.
+ * Suppose k = (ibm,thread-groups[i+1] * ibm,thread-groups[i+2]), then,
  *
- * ibm,thread-groups[3..N-1] is the list of threads identified by
+ * ibm,thread-groups[i+3..i+k+2] (is the list of threads identified by
  * "ibm,ppc-interrupt-server#s" arranged as per their membership in
  * the grouping.
  *
- * Example: If ibm,thread-gro

[PATCH v2 2/5] powerpc/smp: Rename cpu_l1_cache_map as thread_group_l1_cache_map

2020-12-09 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On platforms which have the "ibm,thread-groups" property, the per-cpu
variable cpu_l1_cache_map keeps track of which group of threads
within the same core share the L1 cache, Instruction and Data flow.

This patch renames the variable to "thread_group_l1_cache_map" to make
it consistent with a subsequent patch which will introduce
thread_group_l2_cache_map.

This patch introduces no functional change.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 88d88ad..f3290d5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -116,10 +116,10 @@ struct thread_groups_list {
 
 static struct thread_groups_list tgl[NR_CPUS] __initdata;
 /*
- * On big-cores system, cpu_l1_cache_map for each CPU corresponds to
+ * On big-cores system, thread_group_l1_cache_map for each CPU corresponds to
  * the set its siblings that share the L1-cache.
  */
-DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map);
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
 
 /* SMP operations for this machine */
 struct smp_ops_t *smp_ops;
@@ -866,7 +866,7 @@ static struct thread_groups *__init get_thread_groups(int cpu,
return tg;
 }
 
-static int init_cpu_l1_cache_map(int cpu)
+static int init_thread_group_l1_cache_map(int cpu)
 
 {
int first_thread = cpu_first_thread_sibling(cpu);
@@ -885,7 +885,7 @@ static int init_cpu_l1_cache_map(int cpu)
return -ENODATA;
}
 
-   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
+   zalloc_cpumask_var_node(&per_cpu(thread_group_l1_cache_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++) {
@@ -897,7 +897,7 @@ static int init_cpu_l1_cache_map(int cpu)
}
 
if (i_group_start == cpu_group_start)
-   cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu));
+   cpumask_set_cpu(i, per_cpu(thread_group_l1_cache_map, cpu));
}
 
return 0;
@@ -976,7 +976,7 @@ static int init_big_cores(void)
int cpu;
 
for_each_possible_cpu(cpu) {
-   int err = init_cpu_l1_cache_map(cpu);
+   int err = init_thread_group_l1_cache_map(cpu);
 
if (err)
return err;
@@ -1372,7 +1372,7 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
 
cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu));
 
-   for_each_cpu(i, per_cpu(cpu_l1_cache_map, cpu)) {
+   for_each_cpu(i, per_cpu(thread_group_l1_cache_map, cpu)) {
if (cpu_online(i))
set_cpus_related(i, cpu, cpu_smallcore_mask);
}
-- 
1.9.4



[PATCH v2 1/5] powerpc/smp: Parse ibm,thread-groups with multiple properties

2020-12-09 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

The "ibm,thread-groups" device-tree property is an array that is used
to indicate if groups of threads within a core share certain
properties. It provides details of which property is being shared by
which groups of threads. This array can encode information about
multiple properties being shared by different thread-groups within the
core.

Example: Suppose,
"ibm,thread-groups" = [1,2,4,8,10,12,14,9,11,13,15,2,2,4,8,10,12,14,9,11,13,15]

This can be decomposed into two consecutive arrays:

a) [1,2,4,8,10,12,14,9,11,13,15]
b) [2,2,4,8,10,12,14,9,11,13,15]

where in,

a) provides information about Property "1" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of the
   first group is {8,10,12,14} and the "ibm,ppc-interrupt-server#s" of
   the second group is {9,11,13,15}. Property "1" is indicative of
   the threads in the group sharing L1 cache, translation cache and
   Instruction Data flow.

b) provides information about Property "2" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of
   the first group is {8,10,12,14} and the
   "ibm,ppc-interrupt-server#s" of the second group is
   {9,11,13,15}. Property "2" indicates that the threads in each group
   share the L2-cache.

The existing code assumes that the "ibm,thread-groups" encodes
information about only one property. Hence even on platforms which
encode information about multiple properties being shared by the
corresponding groups of threads, the current code will only pick the
first one. (In the above example, it will only consider
[1,2,4,8,10,12,14,9,11,13,15] but not [2,2,4,8,10,12,14,9,11,13,15]).

This patch extends the parsing support on platforms which encode
information about multiple properties being shared by the
corresponding groups of threads.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 174 ++
 1 file changed, 113 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857c..88d88ad 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -106,6 +106,15 @@ struct thread_groups {
unsigned int thread_list[MAX_THREAD_LIST_SIZE];
 };
 
+/* Maximum number of properties that groups of threads within a core can share */
+#define MAX_THREAD_GROUP_PROPERTIES 1
+
+struct thread_groups_list {
+   unsigned int nr_properties;
+   struct thread_groups property_tgs[MAX_THREAD_GROUP_PROPERTIES];
+};
+
+static struct thread_groups_list tgl[NR_CPUS] __initdata;
 /*
  * On big-cores system, cpu_l1_cache_map for each CPU corresponds to
  * the set its siblings that share the L1-cache.
@@ -695,81 +704,98 @@ static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
 /*
  * parse_thread_groups: Parses the "ibm,thread-groups" device tree
  *  property for the CPU device node @dn and stores
- *  the parsed output in the thread_groups
- *  structure @tg if the ibm,thread-groups[0]
- *  matches @property.
+ *  the parsed output in the thread_groups_list
+ *  structure @tglp.
  *
  * @dn: The device node of the CPU device.
- * @tg: Pointer to a thread group structure into which the parsed
+ * @tglp: Pointer to a thread group list structure into which the parsed
  *  output of "ibm,thread-groups" is stored.
- * @property: The property of the thread-group that the caller is
- *interested in.
  *
  * ibm,thread-groups[0..N-1] array defines which group of threads in
  * the CPU-device node can be grouped together based on the property.
  *
- * ibm,thread-groups[0] tells us the property based on which the
+ * This array can represent thread groupings for multiple properties.
+ *
+ * ibm,thread-groups[i + 0] tells us the property based on which the
  * threads are being grouped together. If this value is 1, it implies
  * that the threads in the same group share L1, translation cache.
  *
- * ibm,thread-groups[1] tells us how many such thread groups exist.
+ * ibm,thread-groups[i+1] tells us how many such thread groups exist for the
+ * property ibm,thread-groups[i]
  *
- * ibm,thread-groups[2] tells us the number of threads in each such
+ * ibm,thread-groups[i+2] tells us the number of threads in each such
  * group.
+ * Suppose k = (ibm,thread-groups[i+1] * ibm,thread-groups[i+2]), then,
  *
- * ibm,thread-groups[3..N-1] is the list of threads identified by
+ * ibm,thread-groups[i+3..i+k+2] (is the list of threads identified by
  * "ibm,ppc-interrupt-server#s" arranged as per their membership in
  * the grouping.
  *
- * Example: If ibm,thread-gro

[PATCH v2 4/5] powerpc/smp: Add support detecting thread-groups sharing L2 cache

2020-12-09 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On POWER systems, groups of threads within a core sharing the L2-cache
can be indicated by the "ibm,thread-groups" property array with the
identifier "2".

This patch adds support for detecting this and, when present,
populates the cpu_l2_cache_mask of every CPU with the core-siblings
which share the L2-cache with that CPU, as specified by the
"ibm,thread-groups" property array.

On a platform with the following "ibm,thread-groups" configuration
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

Without this patch, the sched-domain hierarchy for CPUs 0,1 would be
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

The CACHE domain at 0-7 is incorrect since the ibm,thread-groups
sub-array
[00000002 00000002 00000004
 00000000 00000002 00000004 00000006
 00000001 00000003 00000005 00000007]
indicates that L2 (Property "2") is shared only between the threads of a single
group. There are "2" groups, each containing "4" threads. The groups
are {0,2,4,6} and {1,3,5,7}.

With this patch, the sched-domain hierarchy for CPUs 0,1 would be
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

The CACHE domain with span=0,2,4,6 for CPU 0 (span=1,3,5,7 for CPU 1
resp.) gets degenerated into the SMT domain. Furthermore, the
last-level-cache domain gets correctly set to the SMT sched-domain.
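
The degeneration mentioned above falls out of the generic scheduler
topology code: roughly, a parent domain that spans exactly the same
CPUs as its child, and adds nothing on top, is dropped. A toy version
of the span test, with plain bitmasks standing in for struct cpumask:

#include <stdio.h>

int main(void)
{
	unsigned int smt_span   = 0x55;	/* CPUs {0,2,4,6} */
	unsigned int cache_span = 0x55;	/* L2 siblings of CPU 0 */

	printf("CACHE folds into SMT: %s\n",
	       smt_span == cache_span ? "yes" : "no");
	return 0;
}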

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c  | 56 +++---
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index b2035b2..8d3d081 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -134,6 +134,7 @@ static inline struct cpumask *cpu_smallcore_mask(int cpu)
 extern int cpu_to_core_id(int cpu);
 
 extern bool has_big_cores;
+extern bool thread_group_shares_l2;
 
 #define cpu_smt_mask cpu_smt_mask
 #ifdef CONFIG_SCHED_SMT
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9078b5b5..a46cf3f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -76,6 +76,7 @@
 struct task_struct *secondary_current;
 bool has_big_cores;
 bool coregroup_enabled;
+bool thread_group_shares_l2;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -99,6 +100,7 @@ enum {
 
 #define MAX_THREAD_LIST_SIZE   8
 #define THREAD_GROUP_SHARE_L1   1
+#define THREAD_GROUP_SHARE_L2   2
 struct thread_groups {
unsigned int property;
unsigned int nr_groups;
@@ -107,7 +109,7 @@ struct thread_groups {
 };
 
 /* Maximum number of properties that groups of threads within a core can share */
-#define MAX_THREAD_GROUP_PROPERTIES 1
+#define MAX_THREAD_GROUP_PROPERTIES 2
 
 struct thread_groups_list {
unsigned int nr_properties;
@@ -121,6 +123,13 @@ struct thread_groups_list {
  */
 DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
 
+/*
+ * On some big-cores system, thread_group_l2_cache_map for each CPU
+ * corresponds to the set its siblings within the core that share the
+ * L2-cache.
+ */
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+
 /* SMP operations for this machine */
 struct smp_ops_t *smp_ops;
 
@@ -718,7 +727,9 @@ static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
  *
  * ibm,thread-groups[i + 0] tells us the property based on which the
  * threads are being grouped together. If this value is 1, it implies
- * that the threads in the same group share L1, translation cache.
+ * that the threads in the same group share L1, translation cache. If
+ * the value is 2, it implies that the threads in the same group share
+ * the same L2 cache.
  *
  * ibm,thread-groups[i+1] tells us how many such thread groups exist for the
  * property ibm,thread-groups[i]
@@ -874,7 +885,8 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
struct thr

[PATCH v2 5/5] powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache

2020-12-09 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On POWER platforms where only some groups of threads within a core
share the L2-cache (indicated by the ibm,thread-groups device-tree
property), we currently print the incorrect shared_cpu_map/list for
L2-cache in the sysfs.

This patch reports the correct shared_cpu_map/list on such platforms.

Example:
On a platform with "ibm,thread-groups" set to
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

This indicates that threads {0,2,4,6} in the core share one L2-cache
and threads {1,3,5,7} in the core share a different L2-cache.

However, without the patch, the shared_cpu_map/list for L2 for CPUs 0,
1 is reported in the sysfs as follows:

/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map:00,00ff

/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_map:00,00ff

With the patch, the shared_cpu_map/list for L2 cache for CPUs 0, 1 is
correctly reported as follows:

/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,2,4,6
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map:00,0055

/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:1,3,5,7
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_map:00,00aa

This patch adds #ifdef CONFIG_PPC64 checks for these cases to ensure
that 32-bit configs build correctly.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/cacheinfo.c | 34 --
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 65ab9fc..cb87b68 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -641,6 +641,7 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char *
 static struct kobj_attribute cache_level_attr =
__ATTR(level, 0444, level_show, NULL);
 
+#ifdef CONFIG_PPC64
 static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
 {
struct kobject *index_dir_kobj = >kobj;
@@ -650,16 +651,35 @@ static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
 
return dev->id;
 }
+#endif
 
 /*
  * On big-core systems, each core has two groups of CPUs each of which
  * has its own L1-cache. The thread-siblings which share l1-cache with
  * @cpu can be obtained via cpu_smallcore_mask().
+ *
+ * On some big-core systems, the L2 cache is shared only between some
+ * groups of siblings. This is already parsed and encoded in
+ * cpu_l2_cache_mask().
+ *
+ * TODO: cache_lookup_or_instantiate() needs to be made aware of the
+ *   "ibm,thread-groups" property so that cache->shared_cpu_map
+ *   reflects the correct siblings on platforms that have this
+ *   device-tree property. This helper function is only a stop-gap
+ *   solution so that we report the correct siblings to the
+ *   userspace via sysfs.
  */
-static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache)
+static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache)
 {
-   if (cache->level == 1)
-   return cpu_smallcore_mask(cpu);
+#ifdef CONFIG_PPC64
+   if (has_big_cores) {
+   int cpu = index_dir_to_cpu(index);
+   if (cache->level == 1)
+   return cpu_smallcore_mask(cpu);
+   if (cache->level == 2 && thread_group_shares_l2)
+   return cpu_l2_cache_mask(cpu);
+   }
+#endif
 
	return &cache->shared_cpu_map;
 }
@@ -670,17 +690,11 @@ static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *
struct cache_index_dir *index;
struct cache *cache;
const struct cpumask *mask;
-   int cpu;
 
index = kobj_to_cache_index_dir(k);
cache = index->cache;
 
-   if (has_big_cores) {
-   cpu = index_dir_to_cpu(index);
-   mask = get_big_core_shared_cpu_map(cpu, cache);
-   } else {
-   mask  = &cache->shared_cpu_map;
-   }
+   mask = get_shared_cpu_map(index, cache);
 
return cpumap_print_to_pagebuf(list, buf, mask);
 }
-- 
1.9.4



[PATCH v2 0/5] Extend Parsing "ibm,thread-groups" for Shared-L2 information

2020-12-09 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Hi,

This is v2 of the patchset extending the parsing of the "ibm,thread-groups" property
to discover the Shared-L2 cache information.

The v1 can be found here :
https://lore.kernel.org/linuxppc-dev/1607057327-29822-1-git-send-email-...@linux.vnet.ibm.com/T/#m0fabffa1ea1a2807b362f25c849bb19415216520

The key changes from v1, which incorporate the review comments from
Srikar and fix a build error on !PPC64 configs reported by the kernel
build bot, are as follows:

 * Split Patch 1 into three patches
   * The first patch ensures that parse_thread_groups() is generic
     enough to support more than one property.
   * The second patch renames cpu_l1_cache_map as
     thread_group_l1_cache_map for consistency. No functional impact.
   * The third patch makes init_thread_group_l1_cache_map()
     generic. No functional impact.

* Patch 2 (now patch 4): Incorporates the review comments from Srikar,
  simplifying the changes to update_mask_by_l2().

* Patch 3 (now patch 5): Fixes build errors for 32-bit configs
  reported by the kernel build bot.
 
Description of the Patchset
===========================
The "ibm,thread-groups" device-tree property is an array that is used
to indicate if groups of threads within a core share certain
properties. It provides details of which property is being shared by
which groups of threads. This array can encode information about
multiple properties being shared by different thread-groups within the
core.

Example: Suppose,
"ibm,thread-groups" = [1,2,4,8,10,12,14,9,11,13,15,2,2,4,8,10,12,14,9,11,13,15]

This can be decomposed into two consecutive arrays:

a) [1,2,4,8,10,12,14,9,11,13,15]
b) [2,2,4,8,10,12,14,9,11,13,15]

where in,

a) provides information about Property "1" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of the
   first group is {8,10,12,14} and the "ibm,ppc-interrupt-server#s" of
   the second group is {9,11,13,15}. Property "1" is indicative of
   the threads in the group sharing L1 cache, translation cache and
   Instruction Data flow.

b) provides information about Property "2" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of
   the first group is {8,10,12,14} and the
   "ibm,ppc-interrupt-server#s" of the second group is
   {9,11,13,15}. Property "2" indicates that the threads in each group
   share the L2-cache.
   
The existing code assumes that the "ibm,thread-groups" encodes
information about only one property. Hence even on platforms which
encode information about multiple properties being shared by the
corresponding groups of threads, the current code will only pick the
first one. (In the above example, it will only consider
[1,2,4,8,10,12,14,9,11,13,15] but not [2,2,4,8,10,12,14,9,11,13,15]).
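
Once parsed, helpers walk these per-property thread lists; for
instance, finding the first thread of the group that a given CPU
belongs to (the role get_cpu_thread_group_start() plays in the kernel)
amounts to the following illustrative user-space sketch:

#include <stdio.h>
#include <stdint.h>

static int group_start(const uint32_t *threads, uint32_t nr_groups,
		       uint32_t per_group, uint32_t cpu)
{
	for (uint32_t g = 0; g < nr_groups; g++)
		for (uint32_t t = 0; t < per_group; t++)
			if (threads[g * per_group + t] == cpu)
				return g * per_group;
	return -1;
}

int main(void)
{
	/* Thread list shared by both sub-arrays in the example above. */
	uint32_t threads[] = { 8, 10, 12, 14, 9, 11, 13, 15 };

	printf("%d %d\n", group_start(threads, 2, 4, 8),
	       group_start(threads, 2, 4, 11));	/* -> 0 4 */
	return 0;
}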

Furthermore, currently on platforms where groups of threads share L2
cache, we incorrectly create an extra CACHE level sched-domain that
maps to all the threads of the core.

For example, if "ibm,thread-groups" is
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

then, the sub-array
[00000002 00000002 00000004
 00000000 00000002 00000004 00000006
 00000001 00000003 00000005 00000007]
indicates that L2 (Property "2") is shared only between the threads of a single
group. There are "2" groups, each containing "4" threads. The groups
are {0,2,4,6} and {1,3,5,7}.

However, the sched-domain hierarchy for CPUs 0,1 is
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

where the CACHE domain reports that L2 is shared across the entire
core, which is incorrect on such platforms.

This patchset remedies these issues by extending the parsing support
for "ibm,thread-groups" to discover information about multiple
properties being shared by the corresponding groups of threads. In
particular, we can now detect if the groups of threads within a core
share the L2-cache. On such platforms, we populate the
cpu_l2_cache_mask of every CPU with the core-siblings which share L2
with the CPU, as specified by the "ibm,thread-groups" property
array.

With the patchset, the sched-domain hierarchy is correctly
reported. F

[PATCH v2 3/5] powerpc/smp: Rename init_thread_group_l1_cache_map() to make it generic

2020-12-09 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

init_thread_group_l1_cache_map() initializes the per-cpu cpumask
thread_group_l1_cache_map with the core-siblings which share L1 cache
with the CPU. Make this function generic to the cache-property (L1 or
L2) and update a suitable mask. This is a preparatory patch for the
next patch where we will introduce discovery of thread-groups that
share L2-cache.

No functional change.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index f3290d5..9078b5b5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -866,15 +866,18 @@ static struct thread_groups *__init get_thread_groups(int cpu,
return tg;
 }
 
-static int init_thread_group_l1_cache_map(int cpu)
+static int __init init_thread_group_cache_map(int cpu, int cache_property)
 
 {
int first_thread = cpu_first_thread_sibling(cpu);
int i, cpu_group_start = -1, err = 0;
struct thread_groups *tg = NULL;
+   cpumask_var_t *mask;
 
-   tg = get_thread_groups(cpu, THREAD_GROUP_SHARE_L1,
-  &err);
+   if (cache_property != THREAD_GROUP_SHARE_L1)
+   return -EINVAL;
+
+   tg = get_thread_groups(cpu, cache_property, &err);
if (!tg)
return err;
 
@@ -885,8 +888,8 @@ static int init_thread_group_l1_cache_map(int cpu)
return -ENODATA;
}
 
-   zalloc_cpumask_var_node(&per_cpu(thread_group_l1_cache_map, cpu),
-   GFP_KERNEL, cpu_to_node(cpu));
+   mask = &per_cpu(thread_group_l1_cache_map, cpu);
+   zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++) {
int i_group_start = get_cpu_thread_group_start(i, tg);
@@ -897,7 +900,7 @@ static int init_thread_group_l1_cache_map(int cpu)
}
 
if (i_group_start == cpu_group_start)
-   cpumask_set_cpu(i, per_cpu(thread_group_l1_cache_map, cpu));
+   cpumask_set_cpu(i, *mask);
}
 
return 0;
@@ -976,7 +979,7 @@ static int init_big_cores(void)
int cpu;
 
for_each_possible_cpu(cpu) {
-   int err = init_thread_group_l1_cache_map(cpu);
+   int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L1);
 
if (err)
return err;
-- 
1.9.4



Re: [PATCH 3/3] powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache

2020-12-09 Thread Gautham R Shenoy
On Wed, Dec 09, 2020 at 02:09:21PM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy  [2020-12-08 23:26:47]:
> 
> > > The drawback of this is even if cpus 0,2,4,6 are released L1 cache will not
> > > be released. Is this as expected?
> > 
> > cacheinfo populates the cache->shared_cpu_map on the basis of which
> > CPUs share the common device-tree node for a particular cache.  There
> > is one l1-cache object in the device-tree for a CPU node corresponding
> > to a big-core. That the L1 is further split between the threads of the
> > core is shown using ibm,thread-groups.
> > 
> 
> Yes.
> 
> > The ideal thing would be to add a "group_leader" field to "struct
> > cache" so that we can create separate cache objects , one per thread
> > group. I will take a stab at this in the v2.
> > 
> 
> I am not saying this needs to be done immediately. We could add a TODO and
> get it done later. Your patch is not making it worse. Its just that there is
> still something more left to be done.

Yeah, it needs to be fixed but it may not be a 5.11 target. For now I
will fix this patch to take care of the build errors on !PPC64 !SMT
configs. I will post a separate series for making cacheinfo.c aware of
thread-groups at the time of construction of the cache-chain.

> 
> -- 
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH 1/3] powerpc/smp: Parse ibm,thread-groups with multiple properties

2020-12-09 Thread Gautham R Shenoy
On Wed, Dec 09, 2020 at 02:05:41PM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy  [2020-12-08 22:55:40]:
> 
> > > 
> > > NIT:
> > > tglx mentions in one of his recent comments to try keep a reverse fir tree
> > > ordering of variables where possible.
> > 
> > I suppose you mean moving the longer local variable declarations to
> > the top and shorter ones to the bottom. Thanks. Will fix this.
> > 
> 
> Yes.
> 
> > > > +   }
> > > > +
> > > > +   if (!tg)
> > > > +   return -EINVAL;
> > > > +
> > > > +   cpu_group_start = get_cpu_thread_group_start(cpu, tg);
> > > 
> > > This whole hunk should be moved to a new function and called before
> > > init_cpu_cache_map. It will simplify the logic to great extent.
> > 
> > I suppose you are referring to the part where we select the correct
> > tg. Yeah, that can move to a different helper.
> > 
> 
> Yes, I would prefer if we could call this new helper outside
> init_cpu_cache_map.
> 
> > > > 
> > > > -   zalloc_cpumask_var_node(_cpu(cpu_l1_cache_map, cpu),
> > > > -   GFP_KERNEL, cpu_to_node(cpu));
> > > > +   mask = _cpu(cpu_l1_cache_map, cpu);
> > > > +
> > > > +   zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
> > > > 
> > > 
> > > This hunk (and the next hunk) should be moved to next patch.
> > >
> > 
> > The next patch is only about introducing  THREAD_GROUP_SHARE_L2. Hence
> > I put in any other code in this patch, since it seems to be a logical
> > place to collate whatever we have in a generic form.
> > 
> 
> While I am fine with it, having a pointer that always points to the same
> mask looks wierd.

Sure. Moving some of this to a separate preparatory patch.

> 
> -- 
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH 3/3] powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache

2020-12-08 Thread Gautham R Shenoy
On Mon, Dec 07, 2020 at 06:41:38PM +0530, Srikar Dronamraju wrote:
> * Gautham R. Shenoy  [2020-12-04 10:18:47]:
> 
> > From: "Gautham R. Shenoy" 
> > 
> > 
> > Signed-off-by: Gautham R. Shenoy 
> > ---
> > 
> > +extern bool thread_group_shares_l2;
> >  /*
> >   * On big-core systems, each core has two groups of CPUs each of which
> >   * has its own L1-cache. The thread-siblings which share l1-cache with
> >   * @cpu can be obtained via cpu_smallcore_mask().
> > + *
> > + * On some big-core systems, the L2 cache is shared only between some
> > + * groups of siblings. This is already parsed and encoded in
> > + * cpu_l2_cache_mask().
> >   */
> >  static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache)
> >  {
> > if (cache->level == 1)
> > return cpu_smallcore_mask(cpu);
> > +   if (cache->level == 2 && thread_group_shares_l2)
> > +   return cpu_l2_cache_mask(cpu);
> > 
> > 	return &cache->shared_cpu_map;
> 
> As pointed with l...@intel.org, we need to do this only with #CONFIG_SMP,
> even for cache->level = 1 too.

Yes, I have fixed that in the next version.

> 
> I agree that we are displaying shared_cpu_map correctly. Should we have also
> update /clear shared_cpu_map in the first place. For example:- If for a P9
> core with CPUs 0-7, the cache->shared_cpu_map for L1 would have 0-7 but
> would display 0,2,4,6.
> 
> The drawback of this is even if cpus 0,2,4,6 are released L1 cache will not
> be released. Is this as expected?

cacheinfo populates the cache->shared_cpu_map on the basis of which
CPUs share the common device-tree node for a particular cache.  There
is one l1-cache object in the device-tree for a CPU node corresponding
to a big-core. That the L1 is further split between the threads of the
core is shown using ibm,thread-groups.

The ideal thing would be to add a "group_leader" field to "struct
cache" so that we can create separate cache objects, one per thread
group. I will take a stab at this in the v2.
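
A rough shape of that idea, purely as a sketch (the field names here
are invented for illustration and are not part of any posted patch):

struct cache_sketch {
	int level;			   /* 1 for L1, 2 for L2, ... */
	unsigned long shared_cpu_map;	   /* stand-in for struct cpumask */
	struct cache_sketch *group_leader; /* first cache object of the
					    * thread-group; NULL if the
					    * cache is not split */
};

One object per thread-group would also let cacheinfo release a group's
cache object once all the CPUs of that group go offline, addressing
the drawback noted above.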

Thanks for the review comments.



> 
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH 2/3] powerpc/smp: Add support detecting thread-groups sharing L2 cache

2020-12-08 Thread Gautham R Shenoy
Hello Srikar,

On Mon, Dec 07, 2020 at 06:10:39PM +0530, Srikar Dronamraju wrote:
> * Gautham R. Shenoy  [2020-12-04 10:18:46]:
> 
> > From: "Gautham R. Shenoy" 
> > 
> > On POWER systems, groups of threads within a core sharing the L2-cache
> > can be indicated by the "ibm,thread-groups" property array with the
> > identifier "2".
> > 
> > This patch adds support for detecting this and, when present,
> > populates the cpu_l2_cache_mask of every CPU with the core-siblings
> > which share L2 with the CPU, as specified by the
> > "ibm,thread-groups" property array.
> > 
> > On a platform with the following "ibm,thread-group" configuration
> >  00000001 00000002 00000004 00000000
> >  00000002 00000004 00000006 00000001
> >  00000003 00000005 00000007 00000002
> >  00000002 00000004 00000000 00000002
> >  00000004 00000006 00000001 00000003
> >  00000005 00000007
> > 
> > Without this patch, the sched-domain hierarchy for CPUs 0,1 would be
> > CPU0 attaching sched-domain(s):
> > domain-0: span=0,2,4,6 level=SMT
> > domain-1: span=0-7 level=CACHE
> > domain-2: span=0-15,24-39,48-55 level=MC
> > domain-3: span=0-55 level=DIE
> > 
> > CPU1 attaching sched-domain(s):
> > domain-0: span=1,3,5,7 level=SMT
> > domain-1: span=0-7 level=CACHE
> > domain-2: span=0-15,24-39,48-55 level=MC
> > domain-3: span=0-55 level=DIE
> > 
> > The CACHE domain at 0-7 is incorrect since the ibm,thread-groups
> > sub-array
> > [00000002 00000002 00000004
> >  00000000 00000002 00000004 00000006
> >  00000001 00000003 00000005 00000007]
> > indicates that L2 (Property "2") is shared only between the threads of a single
> > group. There are "2" groups of threads where each group contains "4"
> > threads each. The groups being {0,2,4,6} and {1,3,5,7}.
> > 
> > With this patch, the sched-domain hierarchy for CPUs 0,1 would be
> > CPU0 attaching sched-domain(s):
> > domain-0: span=0,2,4,6 level=SMT
> > domain-1: span=0-15,24-39,48-55 level=MC
> > domain-2: span=0-55 level=DIE
> > 
> > CPU1 attaching sched-domain(s):
> > domain-0: span=1,3,5,7 level=SMT
> > domain-1: span=0-15,24-39,48-55 level=MC
> > domain-2: span=0-55 level=DIE
> > 
> > The CACHE domain with span=0,2,4,6 for CPU 0 (span=1,3,5,7 for CPU 1
> > resp.) gets degenerated into the SMT domain. Furthermore, the
> > last-level-cache domain gets correctly set to the SMT sched-domain.
> > 
> > Signed-off-by: Gautham R. Shenoy 
> > ---
> >  arch/powerpc/kernel/smp.c | 66 +--
> >  1 file changed, 58 insertions(+), 8 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > index 6a242a3..a116d2d 100644
> > --- a/arch/powerpc/kernel/smp.c
> > +++ b/arch/powerpc/kernel/smp.c
> > @@ -76,6 +76,7 @@
> >  struct task_struct *secondary_current;
> >  bool has_big_cores;
> >  bool coregroup_enabled;
> > +bool thread_group_shares_l2;
> 
> Either keep this as static in this patch or add its declaration
>

This will be used in Patch 3 in kernel/cacheinfo.c, but not in any
other place. Hence I am not making it static here.


> > 
> >  DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
> >  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
> > @@ -99,6 +100,7 @@ enum {
> > 
> >  #define MAX_THREAD_LIST_SIZE   8
> >  #define THREAD_GROUP_SHARE_L1   1
> > +#define THREAD_GROUP_SHARE_L2   2
> >  struct thread_groups {
> > unsigned int property;
> > unsigned int nr_groups;
> > @@ -107,7 +109,7 @@ struct thread_groups {
> >  };
> > 
> >  /* Maximum number of properties that groups of threads within a core can share */
> > -#define MAX_THREAD_GROUP_PROPERTIES 1
> > +#define MAX_THREAD_GROUP_PROPERTIES 2
> > 
> >  struct thread_groups_list {
> > unsigned int nr_properties;
> > @@ -121,6 +123,13 @@ struct thread_groups_list {
> >   */
> >  DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map);
> > 
> > +/*
> > + * On some big-cores system, thread_group_l2_cache_map for each CPU
> > + * corresponds to the set its siblings within the core that share the
> > + * L2-cache.
> > + */
> > +DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);

Re: [PATCH 1/3] powerpc/smp: Parse ibm,thread-groups with multiple properties

2020-12-08 Thread Gautham R Shenoy
Hello Srikar,

Thanks for taking a look at the patch.

On Mon, Dec 07, 2020 at 05:40:42PM +0530, Srikar Dronamraju wrote:
> * Gautham R. Shenoy  [2020-12-04 10:18:45]:
> 
> > From: "Gautham R. Shenoy" 
> 
> 
> 
> > 
> >  static int parse_thread_groups(struct device_node *dn,
> > -  struct thread_groups *tg,
> > -  unsigned int property)
> > +  struct thread_groups_list *tglp)
> >  {
> > -   int i;
> > -   u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE];
> > +   int i = 0;
> > +   u32 *thread_group_array;
> > u32 *thread_list;
> > size_t total_threads;
> > -   int ret;
> > +   int ret = 0, count;
> > +   unsigned int property_idx = 0;
> 
> NIT:
> tglx mentions in one of his recent comments to try keep a reverse fir tree
> ordering of variables where possible.

I suppose you mean moving the longer local variable declarations to
the top and shorter ones to the bottom. Thanks. Will fix this.


> 
> > 
> > +   count = of_property_count_u32_elems(dn, "ibm,thread-groups");
> > +   thread_group_array = kcalloc(count, sizeof(u32), GFP_KERNEL);
> > ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > -thread_group_array, 3);
> > +thread_group_array, count);
> > if (ret)
> > -   return ret;
> > -
> > -   tg->property = thread_group_array[0];
> > -   tg->nr_groups = thread_group_array[1];
> > -   tg->threads_per_group = thread_group_array[2];
> > -   if (tg->property != property ||
> > -   tg->nr_groups < 1 ||
> > -   tg->threads_per_group < 1)
> > -   return -ENODATA;
> > +   goto out_free;
> > 
> > -   total_threads = tg->nr_groups * tg->threads_per_group;
> > +   while (i < count && property_idx < MAX_THREAD_GROUP_PROPERTIES) {
> > +   int j;
> > +   struct thread_groups *tg = &tglp->property_tgs[property_idx++];
> 
> NIT: same as above.

Ok.
> 
> > 
> > -   ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > -thread_group_array,
> > -3 + total_threads);
> > -   if (ret)
> > -   return ret;
> > +   tg->property = thread_group_array[i];
> > +   tg->nr_groups = thread_group_array[i + 1];
> > +   tg->threads_per_group = thread_group_array[i + 2];
> > +   total_threads = tg->nr_groups * tg->threads_per_group;
> > +
> > +   thread_list = &thread_group_array[i + 3];
> > 
> > -   thread_list = _group_array[3];
> > +   for (j = 0; j < total_threads; j++)
> > +   tg->thread_list[j] = thread_list[j];
> > +   i = i + 3 + total_threads;
> 
>   Can't we simply use memcpy instead?

We could. But this one makes it more explicit.
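
For reference, the memcpy form being suggested would be something like
the following (equivalent under the assumption, which holds in
parse_thread_groups(), that the entries are u32):

	memcpy(tg->thread_list, thread_list,
	       total_threads * sizeof(u32));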


> 
> > +   }
> > 
> > -   for (i = 0 ; i < total_threads; i++)
> > -   tg->thread_list[i] = thread_list[i];
> > +   tglp->nr_properties = property_idx;
> > 
> > -   return 0;
> > +out_free:
> > +   kfree(thread_group_array);
> > +   return ret;
> >  }
> > 
> >  /*
> > @@ -805,24 +827,39 @@ static int get_cpu_thread_group_start(int cpu, struct 
> > thread_groups *tg)
> > return -1;
> >  }
> > 
> > -static int init_cpu_l1_cache_map(int cpu)
> > +static int init_cpu_cache_map(int cpu, unsigned int cache_property)
> > 
> >  {
> > struct device_node *dn = of_get_cpu_node(cpu, NULL);
> > -   struct thread_groups tg = {.property = 0,
> > -  .nr_groups = 0,
> > -  .threads_per_group = 0};
> > +   struct thread_groups *tg = NULL;
> > int first_thread = cpu_first_thread_sibling(cpu);
> > int i, cpu_group_start = -1, err = 0;
> > +   cpumask_var_t *mask;
> > +   struct thread_groups_list *cpu_tgl = &tgl[cpu];
> 
> NIT: same as 1st comment.

Sure, will fix this.

> 
> > 
> > if (!dn)
> > return -ENODATA;
> > 
> > -   err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1);
> > -   if (err)
> > -   goto out;
> > +   if (!(cache_property == THREAD_GROUP_SHARE_L1))
> > +   return -EINVAL;
> > 
> > -   cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
> > +   if (!cpu_tgl->n

[PATCH 1/3] powerpc/smp: Parse ibm,thread-groups with multiple properties

2020-12-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

The "ibm,thread-groups" device-tree property is an array that is used
to indicate if groups of threads within a core share certain
properties. It provides details of which property is being shared by
which groups of threads. This array can encode information about
multiple properties being shared by different thread-groups within the
core.

Example: Suppose,
"ibm,thread-groups" = [1,2,4,8,10,12,14,9,11,13,15,2,2,4,8,10,12,14,9,11,13,15]

This can be decomposed into two consecutive arrays:

a) [1,2,4,8,10,12,14,9,11,13,15]
b) [2,2,4,8,10,12,14,9,11,13,15]

where in,

a) provides information about Property "1" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of the
   first group is {8,10,12,14} and the "ibm,ppc-interrupt-server#s" of
   the second group is {9,11,13,15}. Property "1" is indicative of
   the threads in the group sharing L1 cache, translation cache and
   Instruction Data flow.

b) provides information about Property "2" being shared by "2" groups,
   each containing "4" threads. The "ibm,ppc-interrupt-server#s" of
   the first group is {8,10,12,14} and the
   "ibm,ppc-interrupt-server#s" of the second group is
   {9,11,13,15}. Property "2" indicates that the threads in each group
   share the L2-cache.
   
The existing code assumes that the "ibm,thread-groups" encodes
information about only one property. Hence even on platforms which
encode information about multiple properties being shared by the
corresponding groups of threads, the current code will only pick the
first one. (In the above example, it will only consider
[1,2,4,8,10,12,14,9,11,13,15] but not [2,2,4,8,10,12,14,9,11,13,15]).

This patch extends the parsing support on platforms which encode
information about multiple properties being shared by the
corresponding groups of threads.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 146 +-
 1 file changed, 92 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857c..6a242a3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -106,6 +106,15 @@ struct thread_groups {
unsigned int thread_list[MAX_THREAD_LIST_SIZE];
 };
 
+/* Maximum number of properties that groups of threads within a core can share */
+#define MAX_THREAD_GROUP_PROPERTIES 1
+
+struct thread_groups_list {
+   unsigned int nr_properties;
+   struct thread_groups property_tgs[MAX_THREAD_GROUP_PROPERTIES];
+};
+
+static struct thread_groups_list tgl[NR_CPUS] __initdata;
 /*
  * On big-cores system, cpu_l1_cache_map for each CPU corresponds to
  * the set its siblings that share the L1-cache.
@@ -695,81 +704,94 @@ static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
 /*
  * parse_thread_groups: Parses the "ibm,thread-groups" device tree
  *  property for the CPU device node @dn and stores
- *  the parsed output in the thread_groups
- *  structure @tg if the ibm,thread-groups[0]
- *  matches @property.
+ *  the parsed output in the thread_groups_list
+ *  structure @tglp.
  *
  * @dn: The device node of the CPU device.
- * @tg: Pointer to a thread group structure into which the parsed
+ * @tglp: Pointer to a thread group list structure into which the parsed
  *  output of "ibm,thread-groups" is stored.
- * @property: The property of the thread-group that the caller is
- *interested in.
  *
  * ibm,thread-groups[0..N-1] array defines which group of threads in
  * the CPU-device node can be grouped together based on the property.
  *
- * ibm,thread-groups[0] tells us the property based on which the
+ * This array can represent thread groupings for multiple properties.
+ *
+ * ibm,thread-groups[i + 0] tells us the property based on which the
  * threads are being grouped together. If this value is 1, it implies
  * that the threads in the same group share L1, translation cache.
  *
- * ibm,thread-groups[1] tells us how many such thread groups exist.
+ * ibm,thread-groups[i+1] tells us how many such thread groups exist for the
+ * property ibm,thread-groups[i]
  *
- * ibm,thread-groups[2] tells us the number of threads in each such
+ * ibm,thread-groups[i+2] tells us the number of threads in each such
  * group.
+ * Suppose k = (ibm,thread-groups[i+1] * ibm,thread-groups[i+2]), then,
  *
- * ibm,thread-groups[3..N-1] is the list of threads identified by
+ * ibm,thread-groups[i+3..i+k+2] (is the list of threads identified by
  * "ibm,ppc-interrupt-server#s" arranged as per their membership in
  * the grouping.
  *
- * Example: If ibm,thread-gr

[PATCH 3/3] powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache

2020-12-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On POWER platforms where only some groups of threads within a core
share the L2-cache (indicated by the ibm,thread-groups device-tree
property), we currently print the incorrect shared_cpu_map/list for
L2-cache in the sysfs.

This patch reports the correct shared_cpu_map/list on such platforms.

Example:
On a platform with "ibm,thread-groups" set to
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

This indicates that threads {0,2,4,6} in the core share the L2-cache
and threads {1,3,5,7} in the core share the L2 cache.

However, without the patch, the shared_cpu_map/list for L2 for CPUs 0,
1 is reported in the sysfs as follows:

/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map:000000,000000ff

/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_map:000000,000000ff

With the patch, the shared_cpu_map/list for L2 cache for CPUs 0, 1 is
correctly reported as follows:

/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,2,4,6
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map:000000,00000055

/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:1,3,5,7
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_map:000000,000000aa
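
As a quick cross-check, the corrected masks follow directly from the
group memberships: bits {0,2,4,6} give 0x55 and bits {1,3,5,7} give
0xaa. A tiny C sketch of the arithmetic:

#include <stdio.h>

int main(void)
{
        int group_cpu0[] = { 0, 2, 4, 6 };      /* L2 siblings of CPU0 */
        int group_cpu1[] = { 1, 3, 5, 7 };      /* L2 siblings of CPU1 */
        unsigned long mask0 = 0, mask1 = 0;

        for (int i = 0; i < 4; i++) {
                mask0 |= 1UL << group_cpu0[i];
                mask1 |= 1UL << group_cpu1[i];
        }

        printf("cpu0 shared_cpu_map: %#lx\n", mask0);   /* 0x55 */
        printf("cpu1 shared_cpu_map: %#lx\n", mask1);   /* 0xaa */
        return 0;
}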

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/cacheinfo.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 65ab9fc..1cc8f37 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -651,15 +651,22 @@ static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
return dev->id;
 }
 
+extern bool thread_group_shares_l2;
 /*
  * On big-core systems, each core has two groups of CPUs each of which
  * has its own L1-cache. The thread-siblings which share l1-cache with
  * @cpu can be obtained via cpu_smallcore_mask().
+ *
+ * On some big-core systems, the L2 cache is shared only between some
+ * groups of siblings. This is already parsed and encoded in
+ * cpu_l2_cache_mask().
  */
static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache)
 {
if (cache->level == 1)
return cpu_smallcore_mask(cpu);
+   if (cache->level == 2 && thread_group_shares_l2)
+   return cpu_l2_cache_mask(cpu);
 
return &cache->shared_cpu_map;
 }
-- 
1.9.4



[PATCH 2/3] powerpc/smp: Add support detecting thread-groups sharing L2 cache

2020-12-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On POWER systems, groups of threads within a core sharing the L2-cache
can be indicated by the "ibm,thread-groups" property array with the
identifier "2".

This patch adds support for detecting this and, when present,
populates the cpu_l2_cache_mask of every CPU with the core-siblings
that share the L2 cache with that CPU, as specified by the
"ibm,thread-groups" property array.

On a platform with the following "ibm,thread-groups" configuration
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

Without this patch, the sched-domain hierarchy for CPUs 0,1 would be
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

The CACHE domain at 0-7 is incorrect since the ibm,thread-groups
sub-array
[00000002 00000002 00000004 00000000
  00000002 00000004 00000006
 00000001 00000003 00000005 00000007]
indicates that L2 (Property "2") is shared only between the threads of a
single group. There are "2" groups of threads, where each group contains
"4" threads. The groups are {0,2,4,6} and {1,3,5,7}.

With this patch, the sched-domain hierarchy for CPUs 0,1 would be
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

The CACHE domain with span=0,2,4,6 for CPU 0 (span=1,3,5,7 for CPU 1
resp.) degenerates into the SMT domain. Furthermore, the
last-level-cache domain is now correctly set to the SMT sched-domain.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/smp.c | 66 +--
 1 file changed, 58 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6a242a3..a116d2d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -76,6 +76,7 @@
 struct task_struct *secondary_current;
 bool has_big_cores;
 bool coregroup_enabled;
+bool thread_group_shares_l2;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -99,6 +100,7 @@ enum {
 
 #define MAX_THREAD_LIST_SIZE   8
 #define THREAD_GROUP_SHARE_L1   1
+#define THREAD_GROUP_SHARE_L2   2
 struct thread_groups {
unsigned int property;
unsigned int nr_groups;
@@ -107,7 +109,7 @@ struct thread_groups {
 };
 
 /* Maximum number of properties that groups of threads within a core can share */
-#define MAX_THREAD_GROUP_PROPERTIES 1
+#define MAX_THREAD_GROUP_PROPERTIES 2
 
 struct thread_groups_list {
unsigned int nr_properties;
@@ -121,6 +123,13 @@ struct thread_groups_list {
  */
 DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map);
 
+/*
+ * On some big-core systems, thread_group_l2_cache_map for each CPU
+ * corresponds to the set of its siblings within the core that share
+ * the L2-cache.
+ */
+DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+
 /* SMP operations for this machine */
 struct smp_ops_t *smp_ops;
 
@@ -718,7 +727,9 @@ static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
  *
  * ibm,thread-groups[i + 0] tells us the property based on which the
  * threads are being grouped together. If this value is 1, it implies
- * that the threads in the same group share L1, translation cache.
+ * that the threads in the same group share L1, translation cache. If
+ * the value is 2, it implies that the threads in the same group share
+ * the same L2 cache.
  *
  * ibm,thread-groups[i+1] tells us how many such thread groups exist for the
  * property ibm,thread-groups[i]
@@ -745,10 +756,10 @@ static void or_cpumasks_related(int i, int j, struct cpumask *(*srcmask)(int),
  * 12}.
  *
  * b) there are 2 groups of 4 threads each, where each group of
- *threads share some property indicated by the first value 2. The
- *"ibm,ppc-interrupt-server#s" of the first group is {5,7,9,11}
- *and the "ibm,ppc-interrupt-server#s" of the second group is
- *{6,8,10,12} structure
+ *threads share some property indicated by the first value 2 (L2
+ *cache). The "ibm,ppc-interrupt-server#s" of the first group is
+ *{5,7,9,

[PATCH 0/3] Extend Parsing "ibm,thread-groups" for Shared-L2 information

2020-12-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

The "ibm,thread-groups" device-tree property is an array that is used
to indicate if groups of threads within a core share certain
properties. It provides details of which property is being shared by
which groups of threads. This array can encode information about
multiple properties being shared by different thread-groups within the
core.

Example: Suppose,
"ibm,thread-groups" = [1,2,4,8,10,12,14,9,11,13,15,2,2,4,8,10,12,14,9,11,13,15]

This can be decomposed up into two consecutive arrays:

a) [1,2,4,8,10,12,14,9,11,13,15]
b) [2,2,4,8,10,12,14,9,11,13,15]

where in,

a) provides information of Property "1" being shared by "2" groups,
   each with "4" threads. The "ibm,ppc-interrupt-server#s" of the
   first group is {8,10,12,14} and the "ibm,ppc-interrupt-server#s" of
   the second group is {9,11,13,15}. Property "1" is indicative of
   the threads in the group sharing the L1 cache, translation cache and
   Instruction Data flow.

b) provides information of Property "2" being shared by "2" groups,
   each group with "4" threads. The "ibm,ppc-interrupt-server#s" of
   the first group is {8,10,12,14} and the
   "ibm,ppc-interrupt-server#s" of the second group is
   {9,11,13,15}. Property "2" indicates that the threads in each group
   share the L2-cache.
   
The existing code assumes that the "ibm,thread-groups" encodes
information about only one property. Hence even on platforms which
encode information about multiple properties being shared by the
corresponding groups of threads, the current code will only pick the
first one. (In the above example, it will only consider
[1,2,4,8,10,12,14,9,11,13,15] but not [2,2,4,8,10,12,14,9,11,13,15]).

Furthermore, currently on platforms where groups of threads share L2
cache, we incorrectly create an extra CACHE level sched-domain that
maps to all the threads of the core.

For example, if "ibm,thread-groups" is 
 00000001 00000002 00000004 00000000
 00000002 00000004 00000006 00000001
 00000003 00000005 00000007 00000002
 00000002 00000004 00000000 00000002
 00000004 00000006 00000001 00000003
 00000005 00000007

then, the sub-array
[00000002 00000002 00000004 00000000
  00000002 00000004 00000006
 00000001 00000003 00000005 00000007]
indicates that L2 (Property "2") is shared only between the threads of a
single group. There are "2" groups of threads, where each group contains
"4" threads. The groups are {0,2,4,6} and {1,3,5,7}.

However, the sched-domain hierarchy for CPUs 0,1 is
CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-7 level=CACHE
domain-2: span=0-15,24-39,48-55 level=MC
domain-3: span=0-55 level=DIE

where the CACHE domain reports that L2 is shared across the entire
core which is incorrect on such platforms.


This patchset remedies these issues by extending the parsing support
for "ibm,thread-groups" to discover information about multiple
properties being shared by the corresponding groups of threads. In
particular, we can now detect if the groups of threads within a core
share the L2-cache. On such platforms, we populate the
cpu_l2_cache_mask of every CPU with the core-siblings that share the
L2 cache with that CPU, as specified by the "ibm,thread-groups"
property array.

With the patchset, the sched-domain hierarchy is correctly
reported. For eg for CPUs 0,1, with the patchset

CPU0 attaching sched-domain(s):
domain-0: span=0,2,4,6 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

CPU1 attaching sched-domain(s):
domain-0: span=1,3,5,7 level=SMT
domain-1: span=0-15,24-39,48-55 level=MC
domain-2: span=0-55 level=DIE

The CACHE domain with span=0,2,4,6 for CPU 0 (span=1,3,5,7 for CPU 1
resp.) degenerates into the SMT domain. Furthermore, the
last-level-cache domain is now correctly set to the SMT sched-domain.

Finally, this patchset reports the correct shared_cpu_map/list in the
sysfs for the L2 cache on such platforms. With the patchset, for CPUs
0 and 1, we see the correct L2 shared_cpu_map/list:

/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,2,4,6
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map:000000,00000055

/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:1,3,5,7
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_map:000000,000000aa

The patchset has been tested on older platform

Re: [PATCH 20/33] docs: ABI: testing: make the files compatible with ReST output

2020-11-02 Thread Gautham R Shenoy
y Failure'.
> 
>   - overcurrent : This file gives the total number of times the
> - max frequency is throttled due to 'Overcurrent'.
> +   max frequency is throttled due to 'Overcurrent'.
> 
>   - occ_reset : This file gives the total number of times the max
> - frequency is throttled due to 'OCC Reset'.
> +   frequency is throttled due to 'OCC Reset'.
> 
>   The sysfs attributes representing different throttle reasons 
> like
>   powercap, overtemp, supply_fault, overcurrent and occ_reset map 
> to


This hunk for the powernv cpufreq driver looks good to me.
For these two hunks,

Reviewed-by: Gautham R. Shenoy 




[PATCH v2] cpuidle-pseries: Fix CEDE latency conversion from tb to us

2020-09-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
of the Extended CEDE states advertised by the platform. The values
advertised by the platform are in timebase ticks. However the cpuidle
framework requires the latency values in microseconds.

If the tb-ticks value advertised by the platform corresponds to a value
smaller than 1us, during the conversion from tb-ticks to microseconds,
in the current code, the result becomes zero. This is incorrect as it
puts a CEDE state on par with the snooze state.

This patch fixes this by rounding up the result obtained while
converting the latency value from tb-ticks to microseconds. It also
prints a warning in case we discover an extended-cede state with a
wakeup latency of 0us. In such a case, we ensure that CEDE(0) has a
non-zero wakeup latency.
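
A small stand-alone C sketch of the rounding issue (the 512 MHz
timebase and the tb_to_ns() model here are assumptions for
illustration): a sub-microsecond latency in tb-ticks truncates to 0us
with plain division, while rounding up yields 1us:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL
#define DIV_ROUND_UP_ULL(n, d) (((n) + (d) - 1) / (d))

/* Assume a 512 MHz timebase: one tick is ~1.953 ns. */
static uint64_t tb_to_ns(uint64_t tb_ticks)
{
        return tb_ticks * 1000000000ULL / 512000000ULL;
}

int main(void)
{
        uint64_t latency_tb = 256;              /* 256 ticks -> 500 ns */
        uint64_t ns = tb_to_ns(latency_tb);

        printf("truncating : %llu us\n",
               (unsigned long long)(ns / NSEC_PER_USEC));                /* 0 */
        printf("rounding up: %llu us\n",
               (unsigned long long)DIV_ROUND_UP_ULL(ns, NSEC_PER_USEC)); /* 1 */
        return 0;
}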

Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)")

Signed-off-by: Gautham R. Shenoy 
---
v1-->v2: Added a warning if a CEDE state has 0 wakeup latency (Suggested by 
Joel Stanley)
 Also added code to ensure that CEDE(0) has a non-zero wakeup latency.  
 
 drivers/cpuidle/cpuidle-pseries.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index ff6d99e..a2b5c6f 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -361,7 +361,10 @@ static void __init fixup_cede0_latency(void)
for (i = 0; i < nr_xcede_records; i++) {
struct xcede_latency_record *record = &payload->records[i];
u64 latency_tb = be64_to_cpu(record->latency_ticks);
-   u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC;
+   u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
+
+   if (latency_us == 0)
+   pr_warn("cpuidle: xcede record %d has an unrealistic latency of 0us.\n", i);
 
if (latency_us < min_latency_us)
min_latency_us = latency_us;
@@ -378,10 +381,14 @@ static void __init fixup_cede0_latency(void)
 * Perform the fix-up.
 */
if (min_latency_us < dedicated_states[1].exit_latency) {
-   u64 cede0_latency = min_latency_us - 1;
+   /*
+* We set a minimum of 1us wakeup latency for cede0 to
+* distinguish it from snooze
+*/
+   u64 cede0_latency = 1;
 
-   if (cede0_latency <= 0)
-   cede0_latency = min_latency_us;
+   if (min_latency_us > cede0_latency)
+   cede0_latency = min_latency_us - 1;
 
dedicated_states[1].exit_latency = cede0_latency;
dedicated_states[1].target_residency = 10 * (cede0_latency);
-- 
1.9.4



Re: [PATCH] cpuidle-pseries: Fix CEDE latency conversion from tb to us

2020-09-02 Thread Gautham R Shenoy
Hello Joel,

On Wed, Sep 02, 2020 at 01:08:35AM +, Joel Stanley wrote:
> On Tue, 1 Sep 2020 at 14:09, Gautham R. Shenoy  wrote:
> >
> > From: "Gautham R. Shenoy" 
> >
> > commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
> > CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
> > of the Extended CEDE states advertised by the platform. The values
> > advertised by the platform are in timebase ticks. However the cpuidle
> > framework requires the latency values in microseconds.
> >
> > If the tb-ticks value advertised by the platform corresponds to a value
> > smaller than 1us, during the conversion from tb-ticks to microseconds,
> > in the current code, the result becomes zero. This is incorrect as it
> > puts a CEDE state on par with the snooze state.
> >
> > This patch fixes this by rounding up the result obtained while
> > converting the latency value from tb-ticks to microseconds.
> >
> > Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)")
> >
> > Signed-off-by: Gautham R. Shenoy 
> 
> Reviewed-by: Joel Stanley 
>

Thanks for reviewing the fix.

> Should you check for the zero case and print a warning?

Yes, that would be better. I will post a v2 with that.

> 
> > ---
> >  drivers/cpuidle/cpuidle-pseries.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
> > index ff6d99e..9043358 100644
> > --- a/drivers/cpuidle/cpuidle-pseries.c
> > +++ b/drivers/cpuidle/cpuidle-pseries.c
> > @@ -361,7 +361,7 @@ static void __init fixup_cede0_latency(void)
> > for (i = 0; i < nr_xcede_records; i++) {
> > struct xcede_latency_record *record = &payload->records[i];
> > u64 latency_tb = be64_to_cpu(record->latency_ticks);
> > -   u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC;
> > +   u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
> >
> > if (latency_us < min_latency_us)
> > min_latency_us = latency_us;
> > --
> > 1.9.4
> >


[PATCH] cpuidle-pseries: Fix CEDE latency conversion from tb to us

2020-09-01 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
of the Extended CEDE states advertised by the platform. The values
advertised by the platform are in timebase ticks. However the cpuidle
framework requires the latency values in microseconds.

If the tb-ticks value advertised by the platform corresponds to a value
smaller than 1us, during the conversion from tb-ticks to microseconds,
in the current code, the result becomes zero. This is incorrect as it
puts a CEDE state on par with the snooze state.

This patch fixes this by rounding up the result obtained while
converting the latency value from tb-ticks to microseconds.

Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)")

Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-pseries.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index ff6d99e..9043358 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -361,7 +361,7 @@ static void __init fixup_cede0_latency(void)
for (i = 0; i < nr_xcede_records; i++) {
struct xcede_latency_record *record = &payload->records[i];
u64 latency_tb = be64_to_cpu(record->latency_ticks);
-   u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC;
+   u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
 
if (latency_us < min_latency_us)
min_latency_us = latency_us;
-- 
1.9.4



Re: [PATCH] powerpc/powernv/idle: add a basic stop 0-3 driver for POWER10

2020-08-19 Thread Gautham R Shenoy
On Wed, Aug 19, 2020 at 07:47:00PM +1000, Nicholas Piggin wrote:
> This driver does not restore stop > 3 state, so it limits itself
> to states which do not lose full state or TB.
> 
> The POWER10 SPRs are sufficiently different from P9 that it seems
> easier to split out the P10 code. The POWER10 deep sleep code
> (e.g., the BHRB restore) has been taken out, but it can be re-added
> when stop > 3 support is added.

MMCRA[BHRB] save/restore was in the shallow stop-state path. But we
can add it back later.

> 
> Cc: Ryan P Grimm 
> Cc: Michael Neuling 
> Cc: Gautham R. Shenoy 
> Cc: Pratik Rajesh Sampat 
> Signed-off-by: Nicholas Piggin 

Just a minor comment below. But otherwise the patch looks good to me.

[..snip..]

> @@ -1093,11 +1200,15 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
>   * @dt_idle_states: Number of idle state entries
>   * Returns 0 on success
>   */
> -static void __init pnv_power9_idle_init(void)
> +static void __init pnv_arch300_idle_init(void)
>  {
>   u64 max_residency_ns = 0;
>   int i;
> 
> + /* stop is not really architected, we only have p9,p10 drivers */
> + if (!pvr_version_is(PVR_POWER10) && !pvr_version_is(PVR_POWER9))
> + return;
> +
>   /*
>* pnv_deepest_stop_{val,mask} should be set to values corresponding to
>* the deepest stop state.
> @@ -1112,6 +1223,11 @@ static void __init pnv_power9_idle_init(void)
>   struct pnv_idle_states_t *state = _idle_states[i];
>   u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
> 
> + /* No deep loss driver implemented for POWER10 yet */
> + if (pvr_version_is(PVR_POWER10) &&
> + state->flags & 
> (OPAL_PM_TIMEBASE_STOP|OPAL_PM_LOSE_FULL_CONTEXT))
> + continue;
> +

Should we have a pr_info() informing the user that the kernel is skipping
over these stop states?

Reviewed-by: Gautham R. Shenoy 

>   if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
>(pnv_first_tb_loss_level > psscr_rl))
>   pnv_first_tb_loss_level = psscr_rl;

--
Thanks and Regards
gautham.


Re: [PATCH v5 06/10] powerpc/smp: Optimize start_secondary

2020-08-11 Thread Gautham R Shenoy
Hi Srikar,

On Mon, Aug 10, 2020 at 12:48:30PM +0530, Srikar Dronamraju wrote:
> In start_secondary, even if shared_cache was already set, system does a
> redundant match for cpumask. This redundant check can be removed by
> checking if shared_cache is already set.
> 
> While here, localize the sibling_mask variable to within the if
> condition.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Anton Blanchard 
> Cc: Oliver O'Halloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Jordan Niethe 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 

The change looks good to me.

Reviewed-by: Gautham R. Shenoy 

> ---
> Changelog v4 ->v5:
>   Retain cache domain, no need for generalization
>        (Michael Ellerman, Peter Zijlstra,
>Valentin Schneider, Gautham R. Shenoy)
> 
> Changelog v1 -> v2:
>   Moved shared_cache topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/kernel/smp.c | 17 +++--
>  1 file changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 0c960ce3be42..91cf5d05e7ec 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -851,7 +851,7 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> - return cpu_l2_cache_mask(cpu);
> + return per_cpu(cpu_l2_cache_map, cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -1305,7 +1305,6 @@ static void add_cpu_to_masks(int cpu)
>  void start_secondary(void *unused)
>  {
>   unsigned int cpu = smp_processor_id();
> - struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> 
>   mmgrab(_mm);
>   current->active_mm = _mm;
> @@ -1331,14 +1330,20 @@ void start_secondary(void *unused)
>   /* Update topology CPU masks */
>   add_cpu_to_masks(cpu);
> 
> - if (has_big_cores)
> - sibling_mask = cpu_smallcore_mask;
>   /*
>* Check for any shared caches. Note that this must be done on a
>* per-core basis because one core in the pair might be disabled.
>*/
> - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
> - shared_caches = true;
> + if (!shared_caches) {
> + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> + struct cpumask *mask = cpu_l2_cache_mask(cpu);
> +
> + if (has_big_cores)
> + sibling_mask = cpu_smallcore_mask;
> +
> + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
> + shared_caches = true;
> + }
> 
>   set_numa_node(numa_cpu_lookup_table[cpu]);
>   set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
> -- 
> 2.18.2
> 


Re: [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id

2020-08-04 Thread Gautham R Shenoy
On Fri, Jul 31, 2020 at 04:52:05PM +0530, Aneesh Kumar K.V wrote:
> On PowerNV platforms we always have 1:1 mapping between chip ID and
> firmware group id. Use the helper to convert firmware group id to
> node id instead of directly using chip ID as Linux node id.
> 
> NOTE: This doesn't have any functional change. On PowerNV platforms
> we continue to have 1:1 mapping between firmware group id and
> Linux node id.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  drivers/cpufreq/powernv-cpufreq.c | 9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cpufreq/powernv-cpufreq.c 
> b/drivers/cpufreq/powernv-cpufreq.c
> index 8646eb197cd9..ca82b6ac0989 100644
> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -27,6 +27,7 @@
>  #include 
>  #include  /* Required for cpu_sibling_mask() in UP configs */
>  #include 
> +#include 
>  #include 
> 
>  #define POWERNV_MAX_PSTATES_ORDER  8
> @@ -1069,8 +1070,14 @@ static int init_chip_info(void)
>   }
> 
>   for (i = 0; i < nr_chips; i++) {
> + unsigned int nid;
> +
>   chips[i].id = chip[i];
> - cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
> + /*
> +  * On powernv platforms the firmware group id is the same as the chip id.

But doesn't hurt to be safe :-)

Reviewed-by: Gautham R. Shenoy 

> +  */
> + nid = firmware_group_id_to_nid(chip[i]);
> + cpumask_copy(&chips[i].mask, cpumask_of_node(nid));
> INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
> for_each_cpu(cpu, &chips[i].mask)
> per_cpu(chip_info, cpu) =  &chips[i];
> -- 
> 2.26.2
> 


Re: [PATCH v2] powerpc: Warn about use of smt_snooze_delay

2020-08-04 Thread Gautham R Shenoy
Hi Joel,

On Tue, Jun 30, 2020 at 11:29:35AM +0930, Joel Stanley wrote:
> It's not done anything for a long time. Save the percpu variable, and
> emit a warning to remind users to not expect it to do anything.
> 
> Fixes: 3fa8cad82b94 ("powerpc/pseries/cpuidle: smt-snooze-delay cleanup.")
> Cc: sta...@vger.kernel.org # v3.14
> Signed-off-by: Joel Stanley 


Sorry I missed this v2.

The patch looks good to me.

Acked-by: Gautham R. Shenoy 

> --
> v2:
>  Use pr_warn instead of WARN
>  Reword and print proccess name with pid in message
>  Leave CPU_FTR_SMT test in
>  Add Fixes line
> 
> mpe, if you don't agree then feel free to drop the cc stable.
> 
> Testing 'ppc64_cpu --smt=off' on a 24 core / 4 SMT system it's quite noisy
> as the online/offline loop that ppc64_cpu runs is slow.
> 
> This could be fixed by open coding pr_warn_ratelimit with the ratelimit
> parameters tweaked if someone was concerned. I'll leave that to someone
> else as a future enhancement.
> 
> [  237.642088][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642175][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642261][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642345][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642430][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642516][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642625][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642709][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642793][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  237.642878][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264030][ T1197] store_smt_snooze_delay: 14 callbacks suppressed
> [  254.264033][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264048][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264062][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264075][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264089][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264103][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264116][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264130][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264143][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> [  254.264157][ T1197] ppc64_cpu (1197) used unsupported smt_snooze_delay, 
> this has no effect
> 
> Signed-off-by: Joel Stanley 
> ---
>  arch/powerpc/kernel/sysfs.c | 41 +++--
>  1 file changed, 16 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
> index 571b3259697e..ba6d4cee19ef 100644
> --- a/arch/powerpc/kernel/sysfs.c
> +++ b/arch/powerpc/kernel/sysfs.c
> @@ -32,29 +32,26 @@
> 
>  static DEFINE_PER_CPU(struct cpu, cpu_devices);
> 
> -/*
> - * SMT snooze delay stuff, 64-bit only for now
> - */
> -
>  #ifdef CONFIG_PPC64
> 
> -/* Time in microseconds we delay before sleeping in the idle loop */
> -static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
> +/*
> + * Snooze delay has not been hooked up since 3fa8cad82b94 
> ("powerpc/pseries/cpuidle:
> + * smt-snooze-delay cleanup.") and has been broken even longer. As was 
> foretold in
> + * 2014:
> + *
> + *  "ppc64_util currently utilises it. Once we fix ppc64_util, propose to 
> clean
> + *  up the kernel code."
> + *
> + * At some point in the future this code should be removed.
> + */
> 
>  static ssize_t store_smt_snooze_delay(struct device *dev,
> struct device_attribute *attr,
> const char *buf,
> size_t count)
>  {
> - struct cpu *cpu = container_of(dev, struct cpu, dev);
> - ssize_t ret;
> - long snooze;
> -
> - ret = sscanf(buf, "%ld", &snooze);
> - if (ret != 1)
> - return -EINVAL;
> -
> - per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
> + pr_warn_rateli

Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain

2020-07-31 Thread Gautham R Shenoy
Hi Srikar, Valentin,

On Wed, Jul 29, 2020 at 11:43:55AM +0530, Srikar Dronamraju wrote:
> * Valentin Schneider  [2020-07-28 16:03:11]:
>

[..snip..]

> At this time the current topology would be good enough i.e BIGCORE would
> always be equal to a MC. However in future we could have chips that can have
> lesser/larger number of CPUs in llc than in a BIGCORE or we could have
> granular or split L3 caches within a DIE. In such a case BIGCORE != MC.
> 
> Also in the current P9 itself, two neighbouring core-pairs form a quad.
> Cache latency within a quad is better than a latency to a distant core-pair.
> Cache latency within a core pair is way better than latency within a quad.
> So if we have only 4 threads running on a DIE all of them accessing the same
> cache-lines, then we could probably benefit if all the tasks were to run
> within the quad aka MC/Coregroup.
> 
> I have found some benchmarks which are latency sensitive to benefit by
> having a grouping a quad level (using kernel hacks and not backed by
> firmware changes). Gautham also found similar results in his experiments
> but he only used binding within the stock kernel.
> 
> I am not setting SD_SHARE_PKG_RESOURCES in MC/Coregroup sd_flags as the MC
> domain need not be the LLC domain for Power.

I am observing that SD_SHARE_PKG_RESOURCES at L2 provides the best
results for POWER9 in terms of cache-benefits during wakeup.  On a
POWER9 Boston machine, I ran a producer-consumer test case
(https://github.com/gautshen/misc/blob/master/producer_consumer/producer_consumer.c).

The test case creates two threads, one Producer and another
Consumer, both working on a fairly large shared array of size 64M. In
each iteration, the Producer performs stores to 1024 random locations
and wakes up the Consumer, which, in its iteration, loads from those
exact 1024 locations.
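
A minimal sketch of that producer-consumer handshake (simplified from
the linked test case; the sizes, iteration count and condition-variable
handshake here are illustrative and untimed):

#include <pthread.h>
#include <stdlib.h>

#define ARRAY_SIZE (64UL * 1024 * 1024 / sizeof(int))
#define TOUCHES    1024
#define ITERS      10000

static int *shared_array;
static size_t touched[TOUCHES];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int turn;        /* 0: producer runs next, 1: consumer runs next */

static void *producer(void *unused)
{
        for (int it = 0; it < ITERS; it++) {
                pthread_mutex_lock(&lock);
                while (turn != 0)
                        pthread_cond_wait(&cond, &lock);

                for (int i = 0; i < TOUCHES; i++) {
                        touched[i] = (size_t)rand() % ARRAY_SIZE;
                        shared_array[touched[i]]++;     /* dirty the lines */
                }

                turn = 1;
                pthread_cond_signal(&cond);             /* wake the consumer */
                pthread_mutex_unlock(&lock);
        }
        return NULL;
}

static void *consumer(void *unused)
{
        volatile long sum = 0;

        for (int it = 0; it < ITERS; it++) {
                pthread_mutex_lock(&lock);
                while (turn != 1)
                        pthread_cond_wait(&cond, &lock);

                for (int i = 0; i < TOUCHES; i++)
                        sum += shared_array[touched[i]]; /* pull them back */

                turn = 0;
                pthread_cond_signal(&cond);
                pthread_mutex_unlock(&lock);
        }
        return NULL;
}

int main(void)
{
        pthread_t prod, cons;

        shared_array = calloc(ARRAY_SIZE, sizeof(int));
        if (!shared_array)
                return 1;

        pthread_create(&prod, NULL, producer, NULL);
        pthread_create(&cons, NULL, consumer, NULL);
        pthread_join(prod, NULL);
        pthread_join(cons, NULL);
        return 0;
}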

We measure the number of Consumer iterations per second and the
average time for each Consumer iteration. The smaller the time, the
better it is.

The following results are when I pinned the Producer and Consumer to
different combinations of CPUs to cover Small core , Big-core,
Neighbouring Big-core, Far off core within the same chip, and across
chips. There is a also a case where they are not affined anywhere, and
we let the scheduler wake them up correctly.

We find the best results when the Producer and Consumer are within the
same L2 domain. These numbers are also close to the numbers that we
get when we let the Scheduler wake them up (where LLC is L2).

## Same Small core (4 threads: Shares L1, L2, L3, Frequency Domain)

Consumer affined to  CPU 3
Producer affined to  CPU 1
4698 iterations, avg time: 20034 ns
4951 iterations, avg time: 20012 ns
4957 iterations, avg time: 19971 ns
4968 iterations, avg time: 19985 ns
4970 iterations, avg time: 19977 ns


## Same Big Core (8 threads: Shares L2, L3, Frequency Domain)

Consumer affined to  CPU 7
Producer affined to  CPU 1
4580 iterations, avg time: 19403 ns
4851 iterations, avg time: 19373 ns
4849 iterations, avg time: 19394 ns
4856 iterations, avg time: 19394 ns
4867 iterations, avg time: 19353 ns

## Neighbouring Big-core (Faster data-snooping from L2. Shares L3, Frequency Domain)

Producer affined to  CPU 1
Consumer affined to  CPU 11
4270 iterations, avg time: 24158 ns
4491 iterations, avg time: 24157 ns
4500 iterations, avg time: 24148 ns
4516 iterations, avg time: 24164 ns
4518 iterations, avg time: 24165 ns


## Any other Big-core from Same Chip (Shares L3)

Producer affined to  CPU 1
Consumer affined to  CPU 87
4176 iterations, avg time: 27953 ns
4417 iterations, avg time: 27925 ns
4415 iterations, avg time: 27934 ns
4417 iterations, avg time: 27983 ns
4430 iterations, avg time: 27958 ns


## Different Chips (No cache-sharing)

Consumer affined to  CPU 175
Producer affined to  CPU 1
3277 iterations, avg time: 50786 ns
3063 iterations, avg time: 50732 ns
2831 iterations, avg time: 50737 ns
2859 iterations, avg time: 50688 ns
2849 iterations, avg time: 50722 ns

## Without affining them (Let Scheduler wake-them up appropriately)
Consumer affined to  CPU 0-175
Producer affined to  CPU 0-175
4821 iterations, avg time: 19412 ns
4863 iterations, avg time: 19435 ns
4855 iterations, avg time: 19381 ns
4811 iterations, avg time: 19458 ns
4892 iterations, avg time: 19429 ns


--
Thanks and Regards
gautham.


Re: [PATCH v4 06/10] powerpc/smp: Generalize 2nd sched domain

2020-07-29 Thread Gautham R Shenoy
On Mon, Jul 27, 2020 at 11:02:26AM +0530, Srikar Dronamraju wrote:
> Currently "CACHE" domain happens to be the 2nd sched domain as per
> powerpc_topology. This domain will collapse if cpumask of l2-cache is
> same as SMT domain. However we could generalize this domain such that it
> could mean either be a "CACHE" domain or a "BIGCORE" domain.
> 
> While setting up the "CACHE" domain, check if shared_cache is already
> set.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Anton Blanchard 
> Cc: Oliver O'Halloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 

Reviewed-by: Gautham R. Shenoy 

> ---
> Changelog v1 -> v2:
>   Moved shared_cache topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/kernel/smp.c | 48 +++
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index d997c7411664..3c5ccf6d2b1c 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
>  EXPORT_SYMBOL_GPL(has_big_cores);
> 
> +enum {
> +#ifdef CONFIG_SCHED_SMT
> + smt_idx,
> +#endif
> + bigcore_idx,
> + die_idx,
> +};
> +
>  #define MAX_THREAD_LIST_SIZE 8
>  #define THREAD_GROUP_SHARE_L1   1
>  struct thread_groups {
> @@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> - if (shared_caches)
> - return cpu_l2_cache_mask(cpu);
> -
> - if (has_big_cores)
> - return cpu_smallcore_mask(cpu);
> -
> - return per_cpu(cpu_sibling_map, cpu);
> + return per_cpu(cpu_l2_cache_map, cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static const struct cpumask *cpu_bigcore_mask(int cpu)
> +{
> + return per_cpu(cpu_sibling_map, cpu);
> +}
> +
>  static struct sched_domain_topology_level powerpc_topology[] = {
>  #ifdef CONFIG_SCHED_SMT
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> + { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
>   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>   { NULL, },
>  };
> @@ -1311,7 +1318,6 @@ static void add_cpu_to_masks(int cpu)
>  void start_secondary(void *unused)
>  {
>   unsigned int cpu = smp_processor_id();
> - struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> 
>   mmgrab(_mm);
>   current->active_mm = _mm;
> @@ -1337,14 +1343,20 @@ void start_secondary(void *unused)
>   /* Update topology CPU masks */
>   add_cpu_to_masks(cpu);
> 
> - if (has_big_cores)
> - sibling_mask = cpu_smallcore_mask;
>   /*
>* Check for any shared caches. Note that this must be done on a
>* per-core basis because one core in the pair might be disabled.
>*/
> - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
> - shared_caches = true;
> + if (!shared_caches) {
> + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> + struct cpumask *mask = cpu_l2_cache_mask(cpu);
> +
> + if (has_big_cores)
> + sibling_mask = cpu_smallcore_mask;
> +
> + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
> + shared_caches = true;
> + }
> 
>   set_numa_node(numa_cpu_lookup_table[cpu]);
>   set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
> @@ -1375,9 +1387,17 @@ static void fixup_topology(void)
>  #ifdef CONFIG_SCHED_SMT
>   if (has_big_cores) {
>   pr_info("Big cores detected but using small core scheduling\n");
> - powerpc_topology[0].mask = smallcore_smt_mask;
> + powerpc_topology[smt_idx].mask = smallcore_smt_mask;
>   }
>  #endif
> + if (shared_caches) {
> + pr_info("Using shared cache scheduler topology\n");
> + powerpc_topology[bigcore_idx].mask = shared_cache_mask;
> + powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
> +#ifdef CONFIG_SCHED_DEBUG
> + powerpc_topology[bigcore_idx].name = "CACHE";
> +#endif
> + }
>  }
> 
>  void __init smp_cpus_done(unsigned int max_cpus)
> -- 
> 2.17.1
> 


[PATCH v3 3/3] cpuidle-pseries : Fixup exit latency for CEDE(0)

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

We are currently assuming that CEDE(0) has exit latency 10us, since
there is no way for us to query from the platform.  However, if the
wakeup latency of an Extended CEDE state is smaller than 10us, then we
can be sure that the exit latency of CEDE(0) cannot be more than
that.

In this patch, we fix the exit latency of CEDE(0) if we discover an
Extended CEDE state with wakeup latency smaller than 10us.

Benchmark results:

On POWER8, this patch does not have any impact since the advertised
latency of Extended CEDE (1) is 30us, which is higher than the default
latency of CEDE (0), which is 10us.

On POWER9 we see an improvement in the single-threaded performance of
ebizzy, and no regression in the wakeup latency or the number of
context-switches.

ebizzy:
2 ebizzy threads bound to the same big-core. 25% improvement in the
avg records/s with patch.
x without_patch
* with_patch
N           Min           Max        Median           Avg        Stddev
x  10   2491089   5834307   5398375   4244335 1596244.9
*  10   2893813   5834474   5832448 5327281.3 1055941.4

context_switch2 :
There is no major regression observed with this patch as seen from the
context_switch2 benchmark.

context_switch2 across CPU0 CPU1 (Both belong to same big-core, but different
small cores). We observe a minor 0.14% regression in the number of
context-switches (higher is better).
x without_patch
* with_patch
N           Min           Max        Median           Avg        Stddev
x 500        348872        362236        354712     354745.69      2711.827
* 500        349422        361452        353942      354215.4     2576.9258
Difference at 99.0% confidence
-530.288 +/- 430.963
-0.149484% +/- 0.121485%
(Student's t, pooled s = 2645.24)

context_switch2 across CPU0 CPU8 (Different big-cores). We observe a 0.37%
improvement in the number of context-switches (higher is better).
x without_patch
* with_patch
N           Min           Max        Median           Avg        Stddev
x 500        287956        294940        288896     288977.23     646.59295
* 500        288300        294646        289582     290064.76     1161.9992
Difference at 99.0% confidence
1087.53 +/- 153.194
0.376337% +/- 0.0530125%
(Student's t, pooled s = 940.299)

schbench:
No major difference could be seen until the 99.9th percentile.

Without-patch
Latency percentiles (usec)
50.0th: 29
75.0th: 39
90.0th: 49
95.0th: 59
*99.0th: 13104
99.5th: 14672
99.9th: 15824
min=0, max=17993

With-patch:
Latency percentiles (usec)
50.0th: 29
75.0th: 40
90.0th: 50
95.0th: 61
*99.0th: 13648
99.5th: 14768
99.9th: 15664
min=0, max=29812

Signed-off-by: Gautham R. Shenoy 
---
v2-->v3 : Made notation consistent with first two patches.
 drivers/cpuidle/cpuidle-pseries.c | 41 +--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index f528da7..8d19820 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -350,13 +350,50 @@ static int pseries_cpuidle_driver_init(void)
return 0;
 }
 
-static void __init parse_xcede_idle_states(void)
+static void __init fixup_cede0_latency(void)
 {
+   int i;
+   u64 min_latency_us = dedicated_states[1].exit_latency; /* CEDE latency */
+   struct xcede_latency_payload *payload;
+
if (parse_cede_parameters())
return;
 
pr_info("cpuidle : Skipping the %d Extended CEDE idle states\n",
nr_xcede_records);
+
+   payload = &xcede_latency_parameter.payload;
+   for (i = 0; i < nr_xcede_records; i++) {
+   struct xcede_latency_record *record = &payload->records[i];
+   u64 latency_tb = be64_to_cpu(record->latency_ticks);
+   u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC;
+
+   if (latency_us < min_latency_us)
+   min_latency_us = latency_us;
+   }
+
+   /*
+* By default, we assume that CEDE(0) has exit latency 10us,
+* since there is no way for us to query from the platform.
+*
+* However, if the wakeup latency of an Extended CEDE state is
+* smaller than 10us, then we can be sure that CEDE(0)
+* requires no more than that.
+*
+* Perform the fix-up.
+*/
+   if (min_latency_us < dedicated_states[1].exit_latency) {
+   u64 cede0_latency = min_latency_us - 1;
+
+   if (cede0_latency <= 0)
+   cede0_latency = min_latency_us;
+
+   dedicated_states[1].exit_latency = cede0_latency;
+   dedicated_states[1].target_residency = 10 * (cede0_latency);
+   pr_i

[PATCH v3 2/3] cpuidle-pseries: Add function to parse extended CEDE records

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently we use CEDE with latency-hint 0 as the only other idle state
on a dedicated LPAR apart from the polling "snooze" state.

The platform might support additional extended CEDE idle states, which
can be discovered through the "ibm,get-system-parameter" rtas-call
made with CEDE_LATENCY_TOKEN.

This patch adds a function to obtain information about the extended
CEDE idle states from the platform and parse the contents to populate
an array of extended CEDE states. These idle states thus discovered
will be added to the cpuidle framework in the next patch.
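
For reference, a user-space C sketch of decoding one such 10-byte
record from the raw layout described in the patch below (the helper
names are illustrative; the kernel reads these fields via
be64_to_cpu() on packed structures):

#include <stdio.h>
#include <stdint.h>

/* Per the PAPR layout quoted in the patch: 1-byte latency hint,
 * 8-byte big-endian wakeup latency in tb-ticks, 1-byte wake-on-irq. */
struct xcede_record {
        uint8_t  hint;
        uint64_t latency_ticks;
        uint8_t  wake_on_irq;
};

static uint64_t be64_to_host(const uint8_t *p)
{
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
                v = (v << 8) | p[i];
        return v;
}

static void parse_record(const uint8_t *buf, struct xcede_record *rec)
{
        rec->hint          = buf[0];
        rec->latency_ticks = be64_to_host(&buf[1]);
        rec->wake_on_irq   = buf[9];
}

int main(void)
{
        /* hint = 1, latency = 0x400 tb-ticks, wake-on-irq = 1 */
        uint8_t raw[10] = { 0x01, 0, 0, 0, 0, 0, 0, 0x04, 0x00, 0x01 };
        struct xcede_record rec;

        parse_record(raw, &rec);
        printf("hint = %u, latency = 0x%llx tb ticks, wake-on-irq = %u\n",
               rec.hint, (unsigned long long)rec.latency_ticks,
               rec.wake_on_irq);
        return 0;
}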

dmesg on a POWER8 and POWER9 LPAR, demonstrating the output of parsing
the extended CEDE latency parameters are as follows

POWER8
[   10.093279] xcede : xcede_record_size = 10
[   10.093285] xcede : Record 0 : hint = 1, latency = 0x3c00 tb ticks, Wake-on-irq = 1
[   10.093291] xcede : Record 1 : hint = 2, latency = 0x4e2000 tb ticks, Wake-on-irq = 0
[   10.093297] cpuidle : Skipping the 2 Extended CEDE idle states

POWER9
[5.913180] xcede : xcede_record_size = 10
[5.913183] xcede : Record 0 : hint = 1, latency = 0x400 tb ticks, Wake-on-irq = 1
[5.913188] xcede : Record 1 : hint = 2, latency = 0x3e8000 tb ticks, Wake-on-irq = 0
[5.913193] cpuidle : Skipping the 2 Extended CEDE idle states

Signed-off-by: Gautham R. Shenoy 
---
v2-->v3 : Cleaned up parse_cede_parameters(). Silenced some sparse warnings.
drivers/cpuidle/cpuidle-pseries.c | 142 ++
 1 file changed, 142 insertions(+)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index f5865a2..f528da7 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <asm/rtas.h>
 
 static struct cpuidle_driver pseries_idle_driver = {
.name = "pseries_idle",
@@ -87,6 +88,137 @@ static void check_and_cede_processor(void)
 }
 
 #define NR_DEDICATED_STATES2 /* snooze, CEDE */
+/*
+ * XCEDE : Extended CEDE states discovered through the
+ * "ibm,get-systems-parameter" rtas-call with the token
+ * CEDE_LATENCY_TOKEN
+ */
+#define MAX_XCEDE_STATES   4
+#define XCEDE_LATENCY_RECORD_SIZE  10
+#define XCEDE_LATENCY_PARAM_MAX_LENGTH (2 + 2 + \
+   (MAX_XCEDE_STATES * XCEDE_LATENCY_RECORD_SIZE))
+
+/*
+ * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
+ * table with all the parameters to ibm,get-system-parameters.
+ * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
+ * Settings Information.
+ */
+#define CEDE_LATENCY_TOKEN 45
+
+/*
+ * If the platform supports the cede latency settings
+ * information system parameter it must provide the following
+ * information in the NULL terminated parameter string:
+ *
+ * a. The first byte is the length “N” of each cede
+ *latency setting record minus one (zero indicates a length
+ *of 1 byte).
+ *
+ * b. For each supported cede latency setting a cede latency
+ *setting record consisting of the first “N” bytes as per
+ *the following table.
+ *
+ * -
+ * | Field   | Field  |
+ * | Name| Length |
+ * -
+ * | Cede Latency| 1 Byte |
+ * | Specifier Value ||
+ * -
+ * | Maximum wakeup  ||
+ * | latency in  | 8 Bytes|
+ * | tb-ticks||
+ * -
+ * | Responsive to   ||
+ * | external| 1 Byte |
+ * | interrupts  ||
+ * -
+ *
+ * This version has cede latency record size = 10.
+ *
+ * The structure xcede_latency_payload represents a) and b) with
+ * xcede_latency_record representing the table in b).
+ *
+ * xcede_latency_parameter is what gets returned by
+ * ibm,get-systems-parameter rtas-call when made with
+ * CEDE_LATENCY_TOKEN.
+ *
+ * These structures are only used to represent the data obtained
+ * by the rtas-call. The data is in Big-Endian.
+ */
+struct xcede_latency_record {
+   u8  hint;
+   __be64  latency_ticks;
+   u8  wake_on_irqs;
+} __packed;
+
+struct xcede_latency_payload {
+   u8 record_size;
+   struct xcede_latency_record records[MAX_XCEDE_STATES];
+} __packed;
+
+struct xcede_latency_parameter {
+   __be16  payload_size;
+   struct xcede_latency_payload payload;
+   u8 null_char;
+} __packed;
+
+static unsigned int nr_xcede_records;
+static struct xcede_latency_parameter xcede_latency_parameter __initdata;
+
+static int __init parse_cede_parameters(void)
+{
+   int ret, i;
+   u16 payload_size;
+   u8 xcede_record_size;
+   u32 total_xcede_records_size;
+   struct xcede_latency_payload *pa

[PATCH v3 0/3] cpuidle-pseries: Parse extended CEDE information for idle.

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

This is a v3 of the patch series to parse the extended CEDE
information in the pseries-cpuidle driver.

The previous two versions of the patches can be found here:

v2: 
https://lore.kernel.org/lkml/1596005254-25753-1-git-send-email-...@linux.vnet.ibm.com/

v1: 
https://lore.kernel.org/linuxppc-dev/1594120299-31389-1-git-send-email-...@linux.vnet.ibm.com/

The change from v2 --> v3 :

 * Patch 1: Got rid of some #define-s which were needed mainly for Patches
   4 and 5 of v1, but were retained in v2.

 * Patch 2:

* Based on feedback from Michael Ellerman, rewrote the
  function to parse the extended idle states by explicitly
  defining the structure of the object that is returned by
  ibm,get-system-parameters(CEDE_LATENCY_TOKEN) rtas-call. In
  the previous versions we were passing a character array and
  subsequently parsing the individual elements which can be
  bug-prone. This also gets rid of the excessive (cast *)ing
  that was in the previous versions.

  * Marked some of the functions static and annotated some of
  the functions with __init and data with __initdata. This
  makes Sparse happy.

  * Added comments for CEDE_LATENCY_TOKEN.

  * Renamed add_pseries_idle_states() to
parse_xcede_idle_states(). Again, this is because Patch 4
and 5 from v1 are no longer there.

 * Patch 3: No functional changes, but minor changes to be consistent
   with Patch 1 and 2 of this series.
 
I have additionally tested the code on a POWER8 dedicated LPAR and found
that it has no impact, since the wakeup latency of CEDE(1) is 30us,
which is greater than the default latency that we are assuming for
CEDE(0). So we do not need to fix up the CEDE(0) latency on POWER8.

Vaidy, I have removed your Reviewed-by for v1, since the code has
changed a little bit.

Gautham R. Shenoy (3):
  cpuidle-pseries: Set the latency-hint before entering CEDE
  cpuidle-pseries: Add function to parse extended CEDE records
  cpuidle-pseries : Fixup exit latency for CEDE(0)

 drivers/cpuidle/cpuidle-pseries.c | 190 +-
 1 file changed, 188 insertions(+), 2 deletions(-)

-- 
1.9.4



[PATCH v3 1/3] cpuidle-pseries: Set the latency-hint before entering CEDE

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

As per the PAPR, each H_CEDE call is associated with a latency-hint to
be passed in the VPA field "cede_latency_hint". The CEDE state that
we have been implicitly entering so far is CEDE with latency-hint = 0.

This patch explicitly sets the latency hint corresponding to the CEDE
state that we are currently entering. While at it, we save the
previous hint, to be restored once we wakeup from CEDE. This will be
required in the future when we expose extended-cede states through the
cpuidle framework, where each of them will have a different
cede-latency hint.

Signed-off-by: Gautham R. Shenoy 
---
v2-->v3 : Got rid of the unused NR_CEDE_STATES definition

 drivers/cpuidle/cpuidle-pseries.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 3e058ad2..f5865a2 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -86,19 +86,26 @@ static void check_and_cede_processor(void)
}
 }
 
+#define NR_DEDICATED_STATES2 /* snooze, CEDE */
+
+u8 cede_latency_hint[NR_DEDICATED_STATES];
 static int dedicated_cede_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
 {
+   u8 old_latency_hint;
 
pseries_idle_prolog();
get_lppaca()->donate_dedicated_cpu = 1;
+   old_latency_hint = get_lppaca()->cede_latency_hint;
+   get_lppaca()->cede_latency_hint = cede_latency_hint[index];
 
HMT_medium();
check_and_cede_processor();
 
local_irq_disable();
get_lppaca()->donate_dedicated_cpu = 0;
+   get_lppaca()->cede_latency_hint = old_latency_hint;
 
pseries_idle_epilog();
 
@@ -130,7 +137,7 @@ static int shared_cede_loop(struct cpuidle_device *dev,
 /*
  * States for dedicated partition case.
  */
-static struct cpuidle_state dedicated_states[] = {
+static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
@@ -233,7 +240,7 @@ static int pseries_idle_probe(void)
max_idle_state = ARRAY_SIZE(shared_states);
} else {
cpuidle_state_table = dedicated_states;
-   max_idle_state = ARRAY_SIZE(dedicated_states);
+   max_idle_state = NR_DEDICATED_STATES;
}
} else
return -ENODEV;
-- 
1.9.4



[PATCH v2 3/3] cpuidle-pseries : Fixup exit latency for CEDE(0)

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

We are currently assuming that CEDE(0) has exit latency 10us, since
there is no way for us to query from the platform.  However, if the
wakeup latency of an Extended CEDE state is smaller than 10us, then we
can be sure that the exit latency of CEDE(0) cannot be more than
that.

In this patch, we fix the exit latency of CEDE(0) if we discover an
Extended CEDE state with wakeup latency smaller than 10us.

Benchmark results:

ebizzy:
2 ebizzy threads bound to the same big-core. 25% improvement in the
avg records/s with patch.
x without_patch
+ with_patch
N           Min           Max        Median           Avg        Stddev
x  10   2491089   5834307   5398375   4244335 1596244.9
+  10   2893813   5834474   5832448 5327281.3 1055941.4

context_switch2 :
There is no major regression observed with this patch as seen from the
context_switch2 benchmark.

context_switch2 across CPU0 CPU1 (Both belong to same big-core, but different
small cores). We observe a minor 0.14% regression in the number of
context-switches (higher is better).
x without_patch
+ with_patch
N           Min           Max        Median           Avg        Stddev
x 500        348872        362236        354712     354745.69      2711.827
+ 500        349422        361452        353942      354215.4     2576.9258
Difference at 99.0% confidence
-530.288 +/- 430.963
-0.149484% +/- 0.121485%
(Student's t, pooled s = 2645.24)

context_switch2 across CPU0 CPU8 (Different big-cores). We observe a 0.37%
improvement in the number of context-switches (higher is better).
x without_patch
+ with_patch
N           Min           Max        Median           Avg        Stddev
x 500        287956        294940        288896     288977.23     646.59295
+ 500        288300        294646        289582     290064.76     1161.9992
Difference at 99.0% confidence
1087.53 +/- 153.194
0.376337% +/- 0.0530125%
(Student's t, pooled s = 940.299)

schbench:
No major difference could be seen until the 99.9th percentile.

Without-patch
Latency percentiles (usec)
50.0th: 29
75.0th: 39
90.0th: 49
95.0th: 59
*99.0th: 13104
99.5th: 14672
99.9th: 15824
min=0, max=17993

With-patch:
Latency percentiles (usec)
50.0th: 29
75.0th: 40
90.0th: 50
95.0th: 61
*99.0th: 13648
99.5th: 14768
99.9th: 15664
min=0, max=29812

Reviewed-by: Vaidyanathan Srinivasan 
Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-pseries.c | 34 --
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index b1dc24d..0b2f115 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -334,12 +334,42 @@ static int pseries_cpuidle_driver_init(void)
 static int add_pseries_idle_states(void)
 {
int nr_states = 2; /* By default we have snooze, CEDE */
+   int i;
+   u64 min_latency_us = dedicated_states[1].exit_latency; /* CEDE latency */
 
if (parse_cede_parameters())
return nr_states;
 
-   pr_info("cpuidle : Skipping the %d Extended CEDE idle states\n",
-   nr_xcede_records);
+   for (i = 0; i < nr_xcede_records; i++) {
+   u64 latency_tb = xcede_records[i].wakeup_latency_tb_ticks;
+   u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC;
+
+   if (latency_us < min_latency_us)
+   min_latency_us = latency_us;
+   }
+
+   /*
+* We are currently assuming that CEDE(0) has exit latency
+* 10us, since there is no way for us to query from the
+* platform.
+*
+* However, if the wakeup latency of an Extended CEDE state is
+* smaller than 10us, then we can be sure that CEDE(0)
+* requires no more than that.
+*
+* Perform the fix-up.
+*/
+   if (min_latency_us < dedicated_states[1].exit_latency) {
+   u64 cede0_latency = min_latency_us - 1;
+
+   if (cede0_latency <= 0)
+   cede0_latency = min_latency_us;
+
+   dedicated_states[1].exit_latency = cede0_latency;
+   dedicated_states[1].target_residency = 10 * (cede0_latency);
+   pr_info("cpuidle : Fixed up CEDE exit latency to %llu us\n",
+   cede0_latency);
+   }
 
return nr_states;
 }
-- 
1.9.4



[PATCH v2 0/3] cpuidle-pseries: Parse extended CEDE information for idle.

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Hi,

This is a v2 of the patch series to parse the extended CEDE
information in the pseries-cpuidle driver.

The v1 of this patchset can be found here :
https://lore.kernel.org/linuxppc-dev/1594120299-31389-1-git-send-email-...@linux.vnet.ibm.com/

The change from v1 --> v2 :

 * Dropped Patches 4 and 5 which would expose extended idle-states,
   that wakeup on external interrupts, to cpuidle framework.  These
   were RFC patches in v1. Dropped them because currently the only
   extended CEDE state that wakesup on external interrupts is CEDE(1)
   which adds no signifcant value over CEDE(0).
   
 * Rebased the patches onto powerpc/merge.
 
 * No changes in code for Patches 1-3.

Motivation:
===
On pseries Dedicated Linux LPARs, apart from the polling snooze idle
state, we currently have the CEDE idle state which cedes the CPU to
the hypervisor with latency-hint = 0.

However, the PowerVM hypervisor supports additional extended CEDE
states, which can be queried through the "ibm,get-systems-parameter"
rtas-call with the CEDE_LATENCY_TOKEN. The hypervisor maps these
extended CEDE states to appropriate platform idle-states in order to
provide energy-savings as well as shifting power to the active
units. On existing pseries LPARs today we have extended CEDE with
latency-hints {1,2} supported.

The patches in this patchset add code to parse the CEDE latency
records provided by the hypervisor. We use this information to
determine the wakeup latency of the regular CEDE (which we have been
so far hardcoding to 10us, while experimentally it is much lower, ~
1us), by looking at the wakeup latency provided by the hypervisor for
Extended CEDE states. Since the platform currently advertises Extended
CEDE 1 to have wakeup latency of 2us, we can be sure that the wakeup
latency of the regular CEDE is no more than this.
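
To make the arithmetic concrete, here is an illustration based on the
dmesg output shown in Patch 2, assuming the standard 512 MHz POWER
timebase (i.e. 512 tb-ticks per microsecond):

    XCEDE hint 1 : 0x400    tb-ticks = 1024 / 512     ~=    2 us
    XCEDE hint 2 : 0x3e8000 tb-ticks = 4096000 / 512  ~= 8000 us

    min wakeup latency across the XCEDE records = 2 us
    ==> CEDE(0) exit_latency is fixed up to (2 - 1) = 1 us,
        and its target_residency to 10 * 1 = 10 us.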

With Patches 1-3, we see an improvement in the single-threaded
performance on ebizzy.

2 ebizzy threads bound to the same big-core. 25% improvement in the
avg records/s (higher the better) with patches 1-3.
x without_patches
* with_patches
    N       Min       Max    Median        Avg     Stddev
x  10   2491089   5834307   5398375    4244335  1596244.9
*  10   2893813   5834474   5832448  5327281.3  1055941.4

We do not observe any major regression in either the context_switch2
benchmark or the schbench benchmark

context_switch2 across CPU0 CPU1 (Both belong to same big-core, but different
small cores). We observe a minor 0.14% regression in the number of
context-switches (higher is better).
x without_patch
* with_patch
    N      Min      Max    Median        Avg     Stddev
x 500   348872   362236    354712  354745.69   2711.827
* 500   349422   361452    353942   354215.4  2576.9258

context_switch2 across CPU0 CPU8 (Different big-cores). We observe a 0.37%
improvement in the number of context-switches (higher is better).
x without_patch
* with_patch
    N      Min      Max    Median        Avg     Stddev
x 500   287956   294940    288896  288977.23  646.59295
* 500   288300   294646    289582  290064.76  1161.9992

schbench:
No major difference could be seen until the 99.9th percentile.

Without-patch
Latency percentiles (usec)
50.0th: 29
75.0th: 39
90.0th: 49
95.0th: 59
*99.0th: 13104
99.5th: 14672
99.9th: 15824
min=0, max=17993

With-patch:
Latency percentiles (usec)
50.0th: 29
75.0th: 40
90.0th: 50
95.0th: 61
*99.0th: 13648
99.5th: 14768
99.9th: 15664
    min=0, max=29812

Gautham R. Shenoy (3):
  cpuidle-pseries: Set the latency-hint before entering CEDE
  cpuidle-pseries: Add function to parse extended CEDE records
  cpuidle-pseries : Fixup exit latency for CEDE(0)

 drivers/cpuidle/cpuidle-pseries.c | 167 +-
 1 file changed, 165 insertions(+), 2 deletions(-)

-- 
1.9.4



[PATCH v2 1/3] cpuidle-pseries: Set the latency-hint before entering CEDE

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

As per the PAPR, each H_CEDE call is associated with a latency-hint to
be passed in the VPA field "cede_latency_hint". The CEDE state that
we have been implicitly entering so far is CEDE with latency-hint = 0.

This patch explicitly sets the latency hint corresponding to the CEDE
state that we are currently entering. While at it, we save the
previous hint, to be restored once we wake up from CEDE. This will be
required in the future when we expose extended-cede states through the
cpuidle framework, where each of them will have a different
cede-latency hint.

Reviewed-by: Vaidyanathan Srinivasan 
Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-pseries.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c 
b/drivers/cpuidle/cpuidle-pseries.c
index 3e058ad2..88e71c3 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -86,19 +86,27 @@ static void check_and_cede_processor(void)
}
 }
 
+#define NR_CEDE_STATES 1  /* CEDE with latency-hint 0 */
+#define NR_DEDICATED_STATES(NR_CEDE_STATES + 1) /* Includes snooze */
+
+u8 cede_latency_hint[NR_DEDICATED_STATES];
 static int dedicated_cede_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
 {
+   u8 old_latency_hint;
 
pseries_idle_prolog();
get_lppaca()->donate_dedicated_cpu = 1;
+   old_latency_hint = get_lppaca()->cede_latency_hint;
+   get_lppaca()->cede_latency_hint = cede_latency_hint[index];
 
HMT_medium();
check_and_cede_processor();
 
local_irq_disable();
get_lppaca()->donate_dedicated_cpu = 0;
+   get_lppaca()->cede_latency_hint = old_latency_hint;
 
pseries_idle_epilog();
 
@@ -130,7 +138,7 @@ static int shared_cede_loop(struct cpuidle_device *dev,
 /*
  * States for dedicated partition case.
  */
-static struct cpuidle_state dedicated_states[] = {
+static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
-- 
1.9.4



[PATCH v2 2/3] cpuidle-pseries: Add function to parse extended CEDE records

2020-07-29 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently we use CEDE with latency-hint 0 as the only other idle state
on a dedicated LPAR apart from the polling "snooze" state.

The platform might support additional extended CEDE idle states, which
can be discovered through the "ibm,get-system-parameter" rtas-call
made with CEDE_LATENCY_TOKEN.

This patch adds a function to obtain information about the extended
CEDE idle states from the platform and parse the contents to populate
an array of extended CEDE states. These idle states thus discovered
will be added to the cpuidle framework in the next patch.

dmesg on a POWER9 LPAR, demonstrating the output of parsing the
extended CEDE latency parameters.

[5.913180] xcede : xcede_record_size = 10
[5.913183] xcede : Record 0 : hint = 1, latency =0x400 tb-ticks, 
Wake-on-irq = 1
[5.913188] xcede : Record 1 : hint = 2, latency =0x3e8000 tb-ticks, 
Wake-on-irq = 0
[5.913193] cpuidle : Skipping the 2 Extended CEDE idle states

Reviewed-by: Vaidyanathan Srinivasan 
Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-pseries.c | 129 +-
 1 file changed, 127 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c 
b/drivers/cpuidle/cpuidle-pseries.c
index 88e71c3..b1dc24d 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static struct cpuidle_driver pseries_idle_driver = {
.name = "pseries_idle",
@@ -86,9 +87,120 @@ static void check_and_cede_processor(void)
}
 }
 
-#define NR_CEDE_STATES 1  /* CEDE with latency-hint 0 */
+struct xcede_latency_records {
+   u8  latency_hint;
+   u64 wakeup_latency_tb_ticks;
+   u8  responsive_to_irqs;
+};
+
+/*
+ * XCEDE : Extended CEDE states discovered through the
+ * "ibm,get-systems-parameter" rtas-call with the token
+ * CEDE_LATENCY_TOKEN
+ */
+#define MAX_XCEDE_STATES   4
+#define XCEDE_LATENCY_RECORD_SIZE  10
+#define XCEDE_LATENCY_PARAM_MAX_LENGTH (2 + 2 + \
+   (MAX_XCEDE_STATES * XCEDE_LATENCY_RECORD_SIZE))
+
+#define CEDE_LATENCY_TOKEN 45
+
+#define NR_CEDE_STATES (MAX_XCEDE_STATES + 1) /* CEDE with latency-hint 0 */
 #define NR_DEDICATED_STATES(NR_CEDE_STATES + 1) /* Includes snooze */
 
+struct xcede_latency_records xcede_records[MAX_XCEDE_STATES];
+unsigned int nr_xcede_records;
+char xcede_parameters[XCEDE_LATENCY_PARAM_MAX_LENGTH];
+
+static int parse_cede_parameters(void)
+{
+   int ret = -1, i;
+   u16 payload_length;
+   u8 xcede_record_size;
+   u32 total_xcede_records_size;
+   char *payload;
+
+   memset(xcede_parameters, 0, XCEDE_LATENCY_PARAM_MAX_LENGTH);
+
+   ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+   NULL, CEDE_LATENCY_TOKEN, __pa(xcede_parameters),
+   XCEDE_LATENCY_PARAM_MAX_LENGTH);
+
+   if (ret) {
+   pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
+   return ret;
+   }
+
+   payload_length = be16_to_cpu(*(__be16 *)(&xcede_parameters[0]));
+   payload = &xcede_parameters[2];
+
+   /*
+    * If the platform supports the cede latency settings
+    * information system parameter it must provide the following
+    * information in the NULL terminated parameter string:
+    *
+    * a. The first byte is the length “N” of each cede
+    *    latency setting record minus one (zero indicates a length
+    *    of 1 byte).
+    *
+    * b. For each supported cede latency setting a cede latency
+    *    setting record consisting of the first “N” bytes as per
+    *    the following table.
+    *
+    *  ----------------------------
+    *  | Field           | Field  |
+    *  | Name            | Length |
+    *  ----------------------------
+    *  | Cede Latency    | 1 Byte |
+    *  | Specifier Value |        |
+    *  ----------------------------
+    *  | Maximum wakeup  |        |
+    *  | latency in      | 8 Bytes|
+    *  | tb-ticks        |        |
+    *  ----------------------------
+    *  | Responsive to   |        |
+    *  | external        | 1 Byte |
+    *  | interrupts      |        |
+    *  ----------------------------
+    *
+    * This version has cede latency record size = 10.
+    */
+   xcede_record_size = (u8)payload[0] + 1;
+
+   if (xcede_record_size != XCEDE_LATENCY_RECORD_SIZE) {
+   pr_err("xcede : Expected record-size %d. Observed size %d.\n",
+  XCEDE_LATENCY_RECORD_SIZE, xcede_record_size);
+   return -EINVAL;
+

Re: [PATCH 0/5] cpuidle-pseries: Parse extended CEDE information for idle.

2020-07-27 Thread Gautham R Shenoy
Hello Rafael,

On Mon, Jul 27, 2020 at 04:14:12PM +0200, Rafael J. Wysocki wrote:
> On Tue, Jul 7, 2020 at 1:32 PM Gautham R Shenoy  
> wrote:
> >
> > Hi,
> >
> > On Tue, Jul 07, 2020 at 04:41:34PM +0530, Gautham R. Shenoy wrote:
> > > From: "Gautham R. Shenoy" 
> > >
> > > Hi,
> > >
> > >
> > >
> > >
> > > Gautham R. Shenoy (5):
> > >   cpuidle-pseries: Set the latency-hint before entering CEDE
> > >   cpuidle-pseries: Add function to parse extended CEDE records
> > >   cpuidle-pseries : Fixup exit latency for CEDE(0)
> > >   cpuidle-pseries : Include extended CEDE states in cpuidle framework
> > >   cpuidle-pseries: Block Extended CEDE(1) which adds no additional
> > > value.
> >
> > Forgot to mention that these patches are on top of Nathan's series to
> > remove extended CEDE offline and bogus topology update code :
> > https://lore.kernel.org/linuxppc-dev/20200612051238.1007764-1-nath...@linux.ibm.com/
> 
> OK, so this is targeted at the powerpc maintainers, isn't it?

Yes, the code is powerpc specific.

Also, I noticed that Nathan's patches have been merged by Michael
Ellerman in the powerpc/merge tree. I will rebase and post a v2 of
this patch series.

--
Thanks and Regards
gautham.


Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain

2020-07-27 Thread Gautham R Shenoy
Hi Srikar,

On Mon, Jul 27, 2020 at 11:02:29AM +0530, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Anton Blanchard 
> Cc: Oliver O'Halloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 

This version looks good to me.

Reviewed-by: Gautham R. Shenoy 


> ---
> Changelog v3 ->v4:
>   if coregroup_support doesn't exist, update MC mask to the next
>   smaller domain mask.
> 
> Changelog v2 -> v3:
>   Add optimization for mask updation under coregroup_support
> 
> Changelog v1 -> v2:
>   Moved coregroup topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/include/asm/topology.h | 10 +++
>  arch/powerpc/kernel/smp.c   | 44 +
>  arch/powerpc/mm/numa.c  |  5 
>  3 files changed, 59 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index f0b6300e7dd3..6609174918ab 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
> 
>  #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
>  extern int find_and_online_cpu_nid(int cpu);
> +extern int cpu_to_coregroup_id(int cpu);
>  #else
>  static inline int find_and_online_cpu_nid(int cpu)
>  {
>   return 0;
>  }
> 
> +static inline int cpu_to_coregroup_id(int cpu)
> +{
> +#ifdef CONFIG_SMP
> + return cpu_to_core_id(cpu);
> +#else
> + return 0;
> +#endif
> +}
> +
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
>  #include 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index dab96a1203ec..95f0bf72e283 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
> +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
> 
>  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
> @@ -91,6 +92,7 @@ enum {
>   smt_idx,
>  #endif
>   bigcore_idx,
> + mc_idx,
>   die_idx,
>  };
> 
> @@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static struct cpumask *cpu_coregroup_mask(int cpu)
> +{
> + return per_cpu(cpu_coregroup_map, cpu);
> +}
> +
> +static bool has_coregroup_support(void)
> +{
> + return coregroup_enabled;
> +}
> +
> +static const struct cpumask *cpu_mc_mask(int cpu)
> +{
> + return cpu_coregroup_mask(cpu);
> +}
> +
>  static const struct cpumask *cpu_bigcore_mask(int cpu)
>  {
>   return per_cpu(cpu_sibling_map, cpu);
> @@ -879,6 +896,7 @@ static struct sched_domain_topology_level 
> powerpc_topology[] = {
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
>   { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> + { cpu_mc_mask, SD_INIT_NAME(MC) },
>   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>   { NULL, },
>  };
> @@ -925,6 +943,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   GFP_KERNEL, cpu_to_node(cpu));
>   zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
>   GFP_KERNEL, cpu_to_node(cpu));
> + if (has_coregroup_support())
> + zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
> + GFP_KERNEL, cpu_to_node(cpu));
> +
>  #ifdef CONFIG_NEED_MULTIPLE_NODES
>   /*
>* numa_node_id() works after this.
> @@ -942,6 +964,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
>   cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
> 
> + if (has_coregroup_support())
> + cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> +
>   init_big_cores();
>   if (has_big_cores) {
>   cpumask_set_cpu(boot_cpuid,
> @@ -1233,6 +1258,8 @@ static void remove_

Re: [PATCH v3 09/10] powerpc/smp: Create coregroup domain

2020-07-26 Thread Gautham R Shenoy
On Thu, Jul 23, 2020 at 02:21:15PM +0530, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Anton Blanchard 
> Cc: Oliver O'Halloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog v2 -> v3:
>   Add optimization for mask updation under coregroup_support
> 
> Changelog v1 -> v2:
>   Moved coregroup topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/include/asm/topology.h | 10 +++
>  arch/powerpc/kernel/smp.c   | 44 +
>  arch/powerpc/mm/numa.c  |  5 
>  3 files changed, 59 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index f0b6300e7dd3..6609174918ab 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
> 
>  #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
>  extern int find_and_online_cpu_nid(int cpu);
> +extern int cpu_to_coregroup_id(int cpu);
>  #else
>  static inline int find_and_online_cpu_nid(int cpu)
>  {
>   return 0;
>  }
> 
> +static inline int cpu_to_coregroup_id(int cpu)
> +{
> +#ifdef CONFIG_SMP
> + return cpu_to_core_id(cpu);
> +#else
> + return 0;
> +#endif
> +}
> +
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
>  #include 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 7d8d44cbab11..1faedde3e406 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
> +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
> 
>  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
> @@ -91,6 +92,7 @@ enum {
>   smt_idx,
>  #endif
>   bigcore_idx,
> + mc_idx,
>   die_idx,
>  };
> 
> @@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static struct cpumask *cpu_coregroup_mask(int cpu)
> +{
> + return per_cpu(cpu_coregroup_map, cpu);
> +}
> +
> +static bool has_coregroup_support(void)
> +{
> + return coregroup_enabled;
> +}
> +
> +static const struct cpumask *cpu_mc_mask(int cpu)
> +{
> + return cpu_coregroup_mask(cpu);
> +}
> +
>  static const struct cpumask *cpu_bigcore_mask(int cpu)
>  {
>   return per_cpu(cpu_sibling_map, cpu);
> @@ -879,6 +896,7 @@ static struct sched_domain_topology_level 
> powerpc_topology[] = {
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
>   { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> + { cpu_mc_mask, SD_INIT_NAME(MC) },
>   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>   { NULL, },
>  };

[..snip..]

> @@ -1384,6 +1425,9 @@ int setup_profiling_timer(unsigned int multiplier)
> 
>  static void fixup_topology(void)
>  {
> + if (!has_coregroup_support())
> + powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
> +
>   if (shared_caches) {
>   pr_info("Using shared cache scheduler topology\n");
>   powerpc_topology[bigcore_idx].mask = shared_cache_mask;


Suppose we consider a topology which does not have coregroup_support,
but has shared_caches. In that case, we would want our coregroup
domain to degenerate.

From the above code, after the fixup, our topology will look as
follows:

static struct sched_domain_topology_level powerpc_topology[] = {
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
{ cpu_bigcore_mask, SD_INIT_NAME(MC) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};

So, in this case, the core-group domain (identified by MC) will
degenerate only if cpu_bigcore_mask() and shared_cache_mask() return
the same value. This may work for existing platforms, because either
shared_caches don't exist, or when they do, cpu_bigcore_mask and
shared_cache_mask return the same set of CPUs. But this may or may not
continue to hold good in the future.

Re: [PATCH v3 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-24 Thread Gautham R Shenoy


On Thu, Jul 23, 2020 at 02:21:11PM +0530, Srikar Dronamraju wrote:
> Current code assumes that cpumask of cpus sharing a l2-cache mask will
> always be a superset of cpu_sibling_mask.
> 
> Lets stop that assumption. cpu_l2_cache_mask is a superset of
> cpu_sibling_mask if and only if shared_caches is set.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Anton Blanchard 
> Cc: Oliver O'Halloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 

Reviewed-by: Gautham R. Shenoy 

> ---
> Changelog v1 -> v2:
>   Set cpumask after verifying l2-cache. (Gautham)
> 
>  arch/powerpc/kernel/smp.c | 28 +++-
>  1 file changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index da27f6909be1..d997c7411664 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1194,6 +1194,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
> *(*mask_fn)(int))
>   if (!l2_cache)
>   return false;
> 
> + cpumask_set_cpu(cpu, mask_fn(cpu));
>   for_each_cpu(i, cpu_online_mask) {
>   /*
>* when updating the marks the current CPU has not been marked
> @@ -1276,29 +1277,30 @@ static void add_cpu_to_masks(int cpu)
>* add it to it's own thread sibling mask.
>*/
>   cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> + cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
>   for (i = first_thread; i < first_thread + threads_per_core; i++)
>   if (cpu_online(i))
>   set_cpus_related(i, cpu, cpu_sibling_mask);
> 
>   add_cpu_to_smallcore_masks(cpu);
> - /*
> -  * Copy the thread sibling mask into the cache sibling mask
> -  * and mark any CPUs that share an L2 with this CPU.
> -  */
> - for_each_cpu(i, cpu_sibling_mask(cpu))
> - set_cpus_related(cpu, i, cpu_l2_cache_mask);
>   update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> - /*
> -  * Copy the cache sibling mask into core sibling mask and mark
> -  * any CPUs on the same chip as this CPU.
> -  */
> - for_each_cpu(i, cpu_l2_cache_mask(cpu))
> - set_cpus_related(cpu, i, cpu_core_mask);
> + if (pkg_id == -1) {
> + struct cpumask *(*mask)(int) = cpu_sibling_mask;
> +
> + /*
> +  * Copy the sibling mask into core sibling mask and
> +  * mark any CPUs on the same chip as this CPU.
> +  */
> + if (shared_caches)
> + mask = cpu_l2_cache_mask;
> +
> + for_each_cpu(i, mask(cpu))
> + set_cpus_related(cpu, i, cpu_core_mask);
> 
> - if (pkg_id == -1)
>   return;
> + }
> 
>   for_each_cpu(i, cpu_online_mask)
>   if (get_physical_package_id(i) == pkg_id)
> -- 
> 2.18.2
> 


Re: [PATCH v2 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-24 Thread Gautham R Shenoy
On Wed, Jul 22, 2020 at 12:27:47PM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy  [2020-07-22 11:51:14]:
> 
> > Hi Srikar,
> > 
> > > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > > index 72f16dc0cb26..57468877499a 100644
> > > --- a/arch/powerpc/kernel/smp.c
> > > +++ b/arch/powerpc/kernel/smp.c
> > > @@ -1196,6 +1196,7 @@ static bool update_mask_by_l2(int cpu, struct 
> > > cpumask *(*mask_fn)(int))
> > >   if (!l2_cache)
> > >   return false;
> > > 
> > > + cpumask_set_cpu(cpu, mask_fn(cpu));
> > 
> > 
> > Ok, we need to do this because "cpu" is not yet set in the
> > cpu_online_mask. Prior to your patch the "cpu" was getting set in
> > cpu_l2_cache_map(cpu) as a side-effect of the code that is removed in
> > the patch.
> > 
> 
> Right.
> 
> > 
> > >   for_each_cpu(i, cpu_online_mask) {
> > >   /*
> > >* when updating the marks the current CPU has not been marked
> > > @@ -1278,29 +1279,30 @@ static void add_cpu_to_masks(int cpu)
> > >* add it to it's own thread sibling mask.
> > >*/
> > >   cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> > > + cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
> Note: Above, we are explicitly setting the cpu_core_mask.

You are right. I missed this.

> 
> > > 
> > >   for (i = first_thread; i < first_thread + threads_per_core; i++)
> > >   if (cpu_online(i))
> > >   set_cpus_related(i, cpu, cpu_sibling_mask);
> > > 
> > >   add_cpu_to_smallcore_masks(cpu);
> > > - /*
> > > -  * Copy the thread sibling mask into the cache sibling mask
> > > -  * and mark any CPUs that share an L2 with this CPU.
> > > -  */
> > > - for_each_cpu(i, cpu_sibling_mask(cpu))
> > > - set_cpus_related(cpu, i, cpu_l2_cache_mask);
> > >   update_mask_by_l2(cpu, cpu_l2_cache_mask);
> > > 
> > > - /*
> > > -  * Copy the cache sibling mask into core sibling mask and mark
> > > -  * any CPUs on the same chip as this CPU.
> > > -  */
> > > - for_each_cpu(i, cpu_l2_cache_mask(cpu))
> > > - set_cpus_related(cpu, i, cpu_core_mask);
> > > + if (pkg_id == -1) {
> > 
> > I suppose this "if" condition is an optimization, since if pkg_id != -1,
> > we anyway set these CPUs in the cpu_core_mask below.
> > 
> > However...
> 
> This is not just an optimization.
> The hunk removed would only work if cpu_l2_cache_mask is bigger than
> cpu_sibling_mask. (this was the previous assumption that we want to break)
> If the cpu_sibling_mask is bigger than cpu_l2_cache_mask and pkg_id is -1,
> then setting only cpu_l2_cache_mask in cpu_core_mask will result in a broken 
> topology.
> 
> > 
> > > + struct cpumask *(*mask)(int) = cpu_sibling_mask;
> > > +
> > > + /*
> > > +  * Copy the sibling mask into core sibling mask and
> > > +  * mark any CPUs on the same chip as this CPU.
> > > +  */
> > > + if (shared_caches)
> > > + mask = cpu_l2_cache_mask;
> > > +
> > > + for_each_cpu(i, mask(cpu))
> > > + set_cpus_related(cpu, i, cpu_core_mask);
> > > 
> > > - if (pkg_id == -1)
> > >   return;
> > > + }
> > 
> > 
> > ... since "cpu" is not yet set in the cpu_online_mask, do we not miss 
> > setting
> > "cpu" in the cpu_core_mask(cpu) in the for-loop below ?
> > 
> > 
> 
> As noted above, we are setting before. So we don't missing the cpu and hence
> have not different from before.


Fair enough.

> 
> > --
> > Thanks and Regards
> > gautham.
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH v3 04/10] powerpc/smp: Move topology fixups into a new function

2020-07-24 Thread Gautham R Shenoy
On Thu, Jul 23, 2020 at 02:21:10PM +0530, Srikar Dronamraju wrote:
> Move topology fixup based on the platform attributes into its own
> function which is called just before set_sched_topology.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Anton Blanchard 
> Cc: Oliver O'Halloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 


Reviewed-by: Gautham R. Shenoy 
> ---
> Changelog v2 -> v3:
>   Rewrote changelog (Gautham)
>   Renamed to powerpc/smp: Move topology fixups into  a new function
> 
>  arch/powerpc/kernel/smp.c | 17 +++--
>  1 file changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index a685915e5941..da27f6909be1 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1368,6 +1368,16 @@ int setup_profiling_timer(unsigned int multiplier)
>   return 0;
>  }
> 
> +static void fixup_topology(void)
> +{
> +#ifdef CONFIG_SCHED_SMT
> + if (has_big_cores) {
> + pr_info("Big cores detected but using small core scheduling\n");
> + powerpc_topology[0].mask = smallcore_smt_mask;
> + }
> +#endif
> +}
> +
>  void __init smp_cpus_done(unsigned int max_cpus)
>  {
>   /*
> @@ -1381,12 +1391,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
> 
>   dump_numa_cpu_topology();
> 
> -#ifdef CONFIG_SCHED_SMT
> - if (has_big_cores) {
> - pr_info("Big cores detected but using small core scheduling\n");
> - powerpc_topology[0].mask = smallcore_smt_mask;
> - }
> -#endif
> + fixup_topology();
>   set_sched_topology(powerpc_topology);
>  }
> 
> -- 
> 2.18.2
> 


Re: [PATCH v3 02/10] powerpc/smp: Merge Power9 topology with Power topology

2020-07-24 Thread Gautham R Shenoy
On Thu, Jul 23, 2020 at 02:21:08PM +0530, Srikar Dronamraju wrote:
> A new sched_domain_topology_level was added just for Power9. However the
> same can be achieved by merging powerpc_topology with power9_topology
> and makes the code more simpler especially when adding a new sched
> domain.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Anton Blanchard 
> Cc: Oliver O'Halloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 

LGTM.


Reviewed-by: Gautham R. Shenoy 

> ---
> Changelog v1 -> v2:
>   Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
>   since cpu_smt_mask is only defined under CONFIG_SCHED_SMT
> 
>  arch/powerpc/kernel/smp.c | 33 ++---
>  1 file changed, 10 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index edf94ca64eea..283a04e54f52 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1313,7 +1313,7 @@ int setup_profiling_timer(unsigned int multiplier)
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymetric SMT dependancy */
> +/* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
>   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> @@ -1326,14 +1326,6 @@ static int powerpc_smt_flags(void)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> - { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> - { NULL, },
> -};
> -
>  /*
>   * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
>   * This topology makes it *much* cheaper to migrate tasks between adjacent 
> cores
> @@ -1351,7 +1343,13 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> - return cpu_l2_cache_mask(cpu);
> + if (shared_caches)
> + return cpu_l2_cache_mask(cpu);
> +
> + if (has_big_cores)
> + return cpu_smallcore_mask(cpu);
> +
> + return per_cpu(cpu_sibling_map, cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -1361,7 +1359,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level power9_topology[] = {
> +static struct sched_domain_topology_level powerpc_topology[] = {
>  #ifdef CONFIG_SCHED_SMT
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> @@ -1386,21 +1384,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
>  #ifdef CONFIG_SCHED_SMT
>   if (has_big_cores) {
>   pr_info("Big cores detected but using small core scheduling\n");
> - power9_topology[0].mask = smallcore_smt_mask;
>   powerpc_topology[0].mask = smallcore_smt_mask;
>   }
>  #endif
> - /*
> -  * If any CPU detects that it's sharing a cache with another CPU then
> -  * use the deeper topology that is aware of this sharing.
> -  */
> - if (shared_caches) {
> - pr_info("Using shared cache scheduler topology\n");
> - set_sched_topology(power9_topology);
> - } else {
> - pr_info("Using standard scheduler topology\n");
> - set_sched_topology(powerpc_topology);
> - }
> + set_sched_topology(powerpc_topology);
>  }
> 
>  #ifdef CONFIG_HOTPLUG_CPU
> -- 
> 2.18.2
> 


Re: [PATCH v2 10/10] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-22 Thread Gautham R Shenoy
On Tue, Jul 21, 2020 at 05:08:14PM +0530, Srikar Dronamraju wrote:
> Lookup the coregroup id from the associativity array.
> 
> If unable to detect the coregroup id, fallback on the core id.
> This way, ensure sched_domain degenerates and an extra sched domain is
> not created.
> 
> Ideally this function should have been implemented in
> arch/powerpc/kernel/smp.c. However if its implemented in mm/numa.c, we
> don't need to find the primary domain again.
> 
> If the device-tree mentions more than one coregroup, then kernel
> implements only the last or the smallest coregroup, which currently
> corresponds to the penultimate domain in the device-tree.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 

Looks good to me.

Reviewed-by: Gautham R. Shenoy 


> ---
> Changelog v1 -> v2:
> powerpc/smp: Implement cpu_to_coregroup_id
>   Move coregroup_enabled before getting associativity (Gautham)
> 
>  arch/powerpc/mm/numa.c | 20 
>  1 file changed, 20 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index ef8aa580da21..ae57b68beaee 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1218,6 +1218,26 @@ int find_and_online_cpu_nid(int cpu)
> 
>  int cpu_to_coregroup_id(int cpu)
>  {
> + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> + int index;
> +
> + if (cpu < 0 || cpu > nr_cpu_ids)
> + return -1;
> +
> + if (!coregroup_enabled)
> + goto out;
> +
> + if (!firmware_has_feature(FW_FEATURE_VPHN))
> + goto out;
> +
> + if (vphn_get_associativity(cpu, associativity))
> + goto out;
> +
> + index = of_read_number(associativity, 1);
> + if (index > min_common_depth + 1)
> + return of_read_number(&associativity[index - 1], 1);
> +
> +out:
>   return cpu_to_core_id(cpu);
>  }
> 
> -- 
> 2.17.1
> 


Re: [PATCH v2 09/10] Powerpc/smp: Create coregroup domain

2020-07-22 Thread Gautham R Shenoy
Hi Srikar,

On Tue, Jul 21, 2020 at 05:08:13PM +0530, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 

A query below.

> ---
> Changelog v1 -> v2:
> Powerpc/smp: Create coregroup domain
>   Moved coregroup topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/include/asm/topology.h | 10 
>  arch/powerpc/kernel/smp.c   | 38 +
>  arch/powerpc/mm/numa.c  |  5 
>  3 files changed, 53 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index f0b6300e7dd3..6609174918ab 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h

[..snip..]

> @@ -91,6 +92,7 @@ enum {
>   smt_idx,
>  #endif
>   bigcore_idx,
> + mc_idx,
>   die_idx,
>  };
> 


[..snip..]

> @@ -879,6 +896,7 @@ static struct sched_domain_topology_level 
> powerpc_topology[] = {
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
>   { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> + { cpu_mc_mask, SD_INIT_NAME(MC) },
>   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>   { NULL, },
>  };


[..snip..]

> @@ -1386,6 +1421,9 @@ int setup_profiling_timer(unsigned int multiplier)
> 
>  static void fixup_topology(void)
>  {
> + if (!has_coregroup_support())
> + powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
> +

Shouldn't we move this condition after doing the fixup for shared
caches ? Because if we have shared_caches, but not core_group, then we
want the coregroup domain to degenerate correctly.


>   if (shared_caches) {
>   pr_info("Using shared cache scheduler topology\n");
>   powerpc_topology[bigcore_idx].mask = shared_cache_mask;
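
Something like the following is what I have in mind -- purely an
illustrative sketch, not a patch (it reuses the enum indices from this
series and assigns the MC mask from whatever the level below ended up
with, so that MC can degenerate against it):

static void fixup_topology(void)
{
        if (shared_caches) {
                pr_info("Using shared cache scheduler topology\n");
                powerpc_topology[bigcore_idx].mask = shared_cache_mask;
#ifdef CONFIG_SCHED_DEBUG
                powerpc_topology[bigcore_idx].name = "CACHE";
#endif
                powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
        }

        /*
         * Perform the coregroup fallback only after the CACHE fixup
         * above, so that the MC level mirrors the (possibly fixed-up)
         * mask of the level below it and degenerates cleanly.
         */
        if (!has_coregroup_support())
                powerpc_topology[mc_idx].mask = powerpc_topology[bigcore_idx].mask;

#ifdef CONFIG_SCHED_SMT
        if (has_big_cores) {
                pr_info("Big cores detected but using small core scheduling\n");
                powerpc_topology[smt_idx].mask = smallcore_smt_mask;
        }
#endif
}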


--
Thanks and regards
gautham.


Re: [PATCH v2 06/10] powerpc/smp: Generalize 2nd sched domain

2020-07-22 Thread Gautham R Shenoy
Hello Srikar,

On Tue, Jul 21, 2020 at 05:08:10PM +0530, Srikar Dronamraju wrote:
> Currently "CACHE" domain happens to be the 2nd sched domain as per
> powerpc_topology. This domain will collapse if cpumask of l2-cache is
> same as SMT domain. However we could generalize this domain such that it
> could mean either be a "CACHE" domain or a "BIGCORE" domain.
> 
> While setting up the "CACHE" domain, check if shared_cache is already
> set.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog v1 -> v2:
> powerpc/smp: Generalize 2nd sched domain
>   Moved shared_cache topology fixup to fixup_topology (Gautham)
>

Just one comment below.

>  arch/powerpc/kernel/smp.c | 49 ---
>  1 file changed, 35 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 57468877499a..933ebdf97432 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
>  EXPORT_SYMBOL_GPL(has_big_cores);
> 
> +enum {
> +#ifdef CONFIG_SCHED_SMT
> + smt_idx,
> +#endif
> + bigcore_idx,
> + die_idx,
> +};
> +


[..snip..]

> @@ -1339,14 +1345,20 @@ void start_secondary(void *unused)
>   /* Update topology CPU masks */
>   add_cpu_to_masks(cpu);
> 
> - if (has_big_cores)
> - sibling_mask = cpu_smallcore_mask;
>   /*
>* Check for any shared caches. Note that this must be done on a
>* per-core basis because one core in the pair might be disabled.
>*/
> - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
> - shared_caches = true;
> + if (!shared_caches) {
> + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> + struct cpumask *mask = cpu_l2_cache_mask(cpu);
> +
> + if (has_big_cores)
> + sibling_mask = cpu_smallcore_mask;
> +
> + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
> + shared_caches = true;

At the risk of repeating my comment to the v1 version of the patch, we
have shared caches only if l2_cache_mask(cpu) is a strict superset of
sibling_mask(cpu).

"cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu))" does not
capture this.

Could we please use

  if (!cpumask_equal(sibling_mask(cpu), mask) &&
  cpumask_subset(sibling_mask(cpu), mask)) {
  }

?


> + }
> 
>   set_numa_node(numa_cpu_lookup_table[cpu]);
>   set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
> @@ -1374,10 +1386,19 @@ int setup_profiling_timer(unsigned int multiplier)
> 
>  static void fixup_topology(void)
>  {
> + if (shared_caches) {
> + pr_info("Using shared cache scheduler topology\n");
> + powerpc_topology[bigcore_idx].mask = shared_cache_mask;
> +#ifdef CONFIG_SCHED_DEBUG
> + powerpc_topology[bigcore_idx].name = "CACHE";
> +#endif
> + powerpc_topology[bigcore_idx].sd_flags = 
> powerpc_shared_cache_flags;
> + }
> +
>  #ifdef CONFIG_SCHED_SMT
>   if (has_big_cores) {
>   pr_info("Big cores detected but using small core scheduling\n");
> - powerpc_topology[0].mask = smallcore_smt_mask;
> + powerpc_topology[smt_idx].mask = smallcore_smt_mask;
>   }
>  #endif


Otherwise the patch looks good to me.

--
Thanks and Regards
gautham.


Re: [PATCH v2 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-22 Thread Gautham R Shenoy
Hi Srikar,

On Tue, Jul 21, 2020 at 05:08:09PM +0530, Srikar Dronamraju wrote:
> Current code assumes that cpumask of cpus sharing a l2-cache mask will
> always be a superset of cpu_sibling_mask.
> 
> Lets stop that assumption. cpu_l2_cache_mask is a superset of
> cpu_sibling_mask if and only if shared_caches is set.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog v1 -> v2:
> powerpc/smp: Dont assume l2-cache to be superset of sibling
>   Set cpumask after verifying l2-cache. (Gautham)
> 
>  arch/powerpc/kernel/smp.c | 28 +++-
>  1 file changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 72f16dc0cb26..57468877499a 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1196,6 +1196,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
> *(*mask_fn)(int))
>   if (!l2_cache)
>   return false;
> 
> + cpumask_set_cpu(cpu, mask_fn(cpu));


Ok, we need to do this because "cpu" is not yet set in the
cpu_online_mask. Prior to your patch the "cpu" was getting set in
cpu_l2_cache_map(cpu) as a side-effect of the code that is removed in
the patch.


>   for_each_cpu(i, cpu_online_mask) {
>   /*
>* when updating the marks the current CPU has not been marked
> @@ -1278,29 +1279,30 @@ static void add_cpu_to_masks(int cpu)
>* add it to it's own thread sibling mask.
>*/
>   cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> + cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
>   for (i = first_thread; i < first_thread + threads_per_core; i++)
>   if (cpu_online(i))
>   set_cpus_related(i, cpu, cpu_sibling_mask);
> 
>   add_cpu_to_smallcore_masks(cpu);
> - /*
> -  * Copy the thread sibling mask into the cache sibling mask
> -  * and mark any CPUs that share an L2 with this CPU.
> -  */
> - for_each_cpu(i, cpu_sibling_mask(cpu))
> - set_cpus_related(cpu, i, cpu_l2_cache_mask);
>   update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> - /*
> -  * Copy the cache sibling mask into core sibling mask and mark
> -  * any CPUs on the same chip as this CPU.
> -  */
> - for_each_cpu(i, cpu_l2_cache_mask(cpu))
> - set_cpus_related(cpu, i, cpu_core_mask);
> + if (pkg_id == -1) {

I suppose this "if" condition is an optimization, since if pkg_id != -1,
we anyway set these CPUs in the cpu_core_mask below.

However...

> + struct cpumask *(*mask)(int) = cpu_sibling_mask;
> +
> + /*
> +  * Copy the sibling mask into core sibling mask and
> +  * mark any CPUs on the same chip as this CPU.
> +  */
> + if (shared_caches)
> + mask = cpu_l2_cache_mask;
> +
> + for_each_cpu(i, mask(cpu))
> + set_cpus_related(cpu, i, cpu_core_mask);
> 
> - if (pkg_id == -1)
>   return;
> + }


... since "cpu" is not yet set in the cpu_online_mask, do we not miss setting
"cpu" in the cpu_core_mask(cpu) in the for-loop below ?


> 
>   for_each_cpu(i, cpu_online_mask)
>   if (get_physical_package_id(i) == pkg_id)


Before this patch it was unconditionally getting set in
cpu_core_mask(cpu) because of the fact that it was set in
cpu_l2_cache_mask(cpu) and we were unconditionally setting all the
CPUs in cpu_l2_cache_mask(cpu) in cpu_core_mask(cpu).

What am I missing ?

> -- 
> 2.17.1
>

--
Thanks and Regards
gautham.


Re: [PATCH v2 04/10] powerpc/smp: Enable small core scheduling sooner

2020-07-21 Thread Gautham R Shenoy
Hello Srikar,

On Tue, Jul 21, 2020 at 05:08:08PM +0530, Srikar Dronamraju wrote:
> Enable small core scheduling as soon as we detect that we are in a
> system that supports thread group. Doing so would avoid a redundant
> check.

The patch looks good to me. However, I think the commit message still
reflects the v1 code where we were moving the functionality from
smp_cpus_done() to init_big_cores().

In this version, we are moving it into a helper function that collates
all the topology fixups.

> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog v1 -> v2:
> powerpc/smp: Enable small core scheduling sooner
>   Restored the previous info msg (Jordan)
>   Moved big core topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/kernel/smp.c | 17 +++--
>  1 file changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 1ce95da00cb6..72f16dc0cb26 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1370,6 +1370,16 @@ int setup_profiling_timer(unsigned int multiplier)
>   return 0;
>  }
> 
> +static void fixup_topology(void)
> +{
> +#ifdef CONFIG_SCHED_SMT
> + if (has_big_cores) {
> + pr_info("Big cores detected but using small core scheduling\n");
> + powerpc_topology[0].mask = smallcore_smt_mask;
> + }
> +#endif
> +}
> +
>  void __init smp_cpus_done(unsigned int max_cpus)
>  {
>   /*
> @@ -1383,12 +1393,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
> 
>   dump_numa_cpu_topology();
> 
> -#ifdef CONFIG_SCHED_SMT
> - if (has_big_cores) {
> - pr_info("Big cores detected but using small core scheduling\n");
> - powerpc_topology[0].mask = smallcore_smt_mask;
> - }
> -#endif
> + fixup_topology();
>   set_sched_topology(powerpc_topology);
>  }
> 
> -- 
> 2.17.1
> 


Re: [PATCH v2 02/10] powerpc/smp: Merge Power9 topology with Power topology

2020-07-21 Thread Gautham R Shenoy
On Tue, Jul 21, 2020 at 05:08:06PM +0530, Srikar Dronamraju wrote:
> A new sched_domain_topology_level was added just for Power9. However the
> same can be achieved by merging powerpc_topology with power9_topology
> and makes the code more simpler especially when adding a new sched
> domain.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog v1 -> v2:
> powerpc/smp: Merge Power9 topology with Power topology
>   Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
>   since cpu_smt_mask is only defined under CONFIG_SCHED_SMT
> 
>  arch/powerpc/kernel/smp.c | 33 ++---
>  1 file changed, 10 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 680c0edcc59d..0e0b118d9b6e 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1315,7 +1315,7 @@ int setup_profiling_timer(unsigned int multiplier)
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymetric SMT dependancy */
> +/* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
>   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> @@ -1328,14 +1328,6 @@ static int powerpc_smt_flags(void)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> - { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> - { NULL, },
> -};
> -
>  /*
>   * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
>   * This topology makes it *much* cheaper to migrate tasks between adjacent 
> cores
> @@ -1353,7 +1345,13 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> - return cpu_l2_cache_mask(cpu);
> + if (shared_caches)
> + return cpu_l2_cache_mask(cpu);
> +
> + if (has_big_cores)
> + return cpu_smallcore_mask(cpu);
> +
> + return per_cpu(cpu_sibling_map, cpu);
>  }


It might be helpful to enumerate the consequences of this change:

With this patch, on POWER7 and POWER8

   SMT and CACHE domains' cpumasks will both be
   per_cpu(cpu_sibling_map, cpu).

   On POWER7 SMT level flags has the following
   (SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)

   On POWER8 SMT level flags has the following
   (SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES).

   On both POWER7 and POWER8, CACHE level flags only has
   SD_SHARE_PKG_RESOURCES

   Thus, on both POWER7 and POWER8, since the SMT and CACHE cpumasks
   are the same and since CACHE has no additional flags which SMT does
   not, the parent domain CACHE will be degenerated.

   Hence we will have SMT --> DIE --> NUMA as before without the
   patch. So the patch introduces no behavioural change. Only change
   is an additional degeneration of the CACHE domain.

On POWER9 : Baremetal.
   SMT level cpumask = per_cpu(cpu_sibling_map, cpu)

   Since the caches are shared for a pair of two cores,
   CACHE level cpumask = cpu_l2_cache_mask(cpu)

   Thus, we will have SMT --> CACHE --> DIE --> NUMA as before.  No
   behavioural change.

On POWER9 : LPAR
   SMT level cpumask = cpu_smallcore_mask(cpu).

   Since the caches are shared,
   CACHE level cpumask = cpu_l2_cache_mask(cpu).

   Thus, we will have SMT --> CACHE --> DIE --> NUMA as before.  Again
   no change in behaviour.
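
For reference, the degeneration assumed in the above enumeration boils
down to the following rule -- a simplified paraphrase of the logic in
kernel/sched/topology.c (the real code also special-cases a few
behavioural flags), not the exact kernel implementation:

/*
 * Illustrative only: a parent sched domain collapses into its child
 * when it spans the same CPUs as the child and contributes no
 * topology flags beyond those the child already has.
 */
static bool parent_degenerates(struct sched_domain *child,
                               struct sched_domain *parent)
{
        return cpumask_equal(sched_domain_span(child),
                             sched_domain_span(parent)) &&
               !(parent->flags & ~child->flags);
}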

Reviewed-by: Gautham R. Shenoy 

--
Thanks and Regards
gautham.


Re: [PATCH v4 2/3] powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable

2020-07-21 Thread Gautham R Shenoy
On Tue, Jul 21, 2020 at 09:07:07PM +0530, Pratik Rajesh Sampat wrote:
> Replace the variable name from using "pnv_first_spr_loss_level" to
> "deep_spr_loss_state".
> 
> pnv_first_spr_loss_level is supposed to be the earliest state that
> has OPAL_PM_LOSE_FULL_CONTEXT set, in other places the kernel uses the
> "deep" states as terminology. Hence renaming the variable to be coherent
> to its semantics.
> 
> Signed-off-by: Pratik Rajesh Sampat 

Acked-by: Gautham R. Shenoy 

> ---
>  arch/powerpc/platforms/powernv/idle.c | 18 +-
>  1 file changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/idle.c 
> b/arch/powerpc/platforms/powernv/idle.c
> index 642abf0b8329..28462d59a8e1 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -48,7 +48,7 @@ static bool default_stop_found;
>   * First stop state levels when SPR and TB loss can occur.
>   */
>  static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
> -static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
> +static u64 deep_spr_loss_state = MAX_STOP_STATE + 1;
> 
>  /*
>   * psscr value and mask of the deepest stop idle state.
> @@ -657,7 +657,7 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
> */
>   mmcr0   = mfspr(SPRN_MMCR0);
>   }
> - if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
> + if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
>   sprs.lpcr   = mfspr(SPRN_LPCR);
>   sprs.hfscr  = mfspr(SPRN_HFSCR);
>   sprs.fscr   = mfspr(SPRN_FSCR);
> @@ -741,7 +741,7 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
>* just always test PSSCR for SPR/TB state loss.
>*/
>   pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
> - if (likely(pls < pnv_first_spr_loss_level)) {
> + if (likely(pls < deep_spr_loss_state)) {
>   if (sprs_saved)
>   atomic_stop_thread_idle();
>   goto out;
> @@ -1088,7 +1088,7 @@ static void __init pnv_power9_idle_init(void)
>* the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
>*/
>   pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
> - pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
> + deep_spr_loss_state = MAX_STOP_STATE + 1;
>   for (i = 0; i < nr_pnv_idle_states; i++) {
>   int err;
>   struct pnv_idle_states_t *state = &pnv_idle_states[i];
> @@ -1099,8 +1099,8 @@ static void __init pnv_power9_idle_init(void)
>   pnv_first_tb_loss_level = psscr_rl;
> 
>   if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
> -  (pnv_first_spr_loss_level > psscr_rl))
> - pnv_first_spr_loss_level = psscr_rl;
> +  (deep_spr_loss_state > psscr_rl))
> + deep_spr_loss_state = psscr_rl;
> 
>   /*
>* The idle code does not deal with TB loss occurring
> @@ -,8 +,8 @@ static void __init pnv_power9_idle_init(void)
>* compatibility.
>*/
>   if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
> -  (pnv_first_spr_loss_level > psscr_rl))
> - pnv_first_spr_loss_level = psscr_rl;
> +  (deep_spr_loss_state > psscr_rl))
> + deep_spr_loss_state = psscr_rl;
> 
>   err = validate_psscr_val_mask(&state->psscr_val,
> &state->psscr_mask,
> @@ -1158,7 +1158,7 @@ static void __init pnv_power9_idle_init(void)
>   }
> 
>   pr_info("cpuidle-powernv: First stop level that may lose SPRs = 
> 0x%llx\n",
> - pnv_first_spr_loss_level);
> + deep_spr_loss_state);
> 
>   pr_info("cpuidle-powernv: First stop level that may lose timebase = 
> 0x%llx\n",
>   pnv_first_tb_loss_level);
> -- 
> 2.25.4
> 


Re: [PATCH v3 2/3] powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable

2020-07-21 Thread Gautham R Shenoy
Hi,

On Wed, Jul 22, 2020 at 12:37:41AM +1000, Nicholas Piggin wrote:
> Excerpts from Pratik Sampat's message of July 21, 2020 8:29 pm:
> > 
> > 
> > On 20/07/20 5:27 am, Nicholas Piggin wrote:
> >> Excerpts from Pratik Rajesh Sampat's message of July 18, 2020 4:53 am:
> >>> Replace the variable name from using "pnv_first_spr_loss_level" to
> >>> "pnv_first_fullstate_loss_level".
> >>>
> >>> As pnv_first_spr_loss_level is supposed to be the earliest state that
> >>> has OPAL_PM_LOSE_FULL_CONTEXT set, however as shallow states too loose
> >>> SPR values, render an incorrect terminology.
> >> It also doesn't lose "full" state at this loss level though. From the
> >> architecture it could be called "hv state loss level", but in POWER10
> >> even that is not strictly true.
> >>
> > Right. Just discovered that deep stop states won't loose full state
> > P10 onwards.
> > Would it better if we rename it as "pnv_all_spr_loss_state" instead
> > so that it stays generic enough while being semantically coherent?
> 
> It doesn't lose all SPRs. It does physically, but for Linux it appears 
> at least timebase SPRs are retained and that's mostly how it's 
> documented.
> 
> Maybe there's no really good name for it, but we do call it "deep" stop 
> in other places, you could call it deep_spr_loss maybe. I don't mind too 
> much though, whatever Gautham is happy with.

Nick's suggestion is fine by me. We can call it deep_spr_loss_state.

> 
> Thanks,
> Nick

--
Thanks and Regards
gautham.


Re: [v3 11/15] powerpc/perf: BHRB control to disable BHRB logic when not used

2020-07-20 Thread Gautham R Shenoy
On Fri, Jul 17, 2020 at 10:38:23AM -0400, Athira Rajeev wrote:
> PowerISA v3.1 has few updates for the Branch History Rolling Buffer(BHRB).
> 
> BHRB disable is controlled via Monitor Mode Control Register A (MMCRA)
> bit, namely "BHRB Recording Disable (BHRBRD)". This field controls
> whether BHRB entries are written when BHRB recording is enabled by other
> bits. This patch implements support for this BHRB disable bit.
> 
> By setting 0b1 to this bit will disable the BHRB and by setting 0b0
> to this bit will have BHRB enabled. This addresses backward
> compatibility (for older OS), since this bit will be cleared and
> hardware will be writing to BHRB by default.
> 
> This patch addresses changes to set MMCRA (BHRBRD) at boot for power10
> ( there by the core will run faster) and enable this feature only on
> runtime ie, on explicit need from user. Also save/restore MMCRA in the
> restore path of state-loss idle state to make sure we keep BHRB disabled
> if it was not enabled on request at runtime.
> 
> Signed-off-by: Athira Rajeev 

For arch/powerpc/platforms/powernv/idle.c
Reviewed-by: Gautham R. Shenoy 


> ---
>  arch/powerpc/perf/core-book3s.c   | 20 
>  arch/powerpc/perf/isa207-common.c | 12 
>  arch/powerpc/platforms/powernv/idle.c | 22 --
>  3 files changed, 48 insertions(+), 6 deletions(-)

[..snip..]

> diff --git a/arch/powerpc/platforms/powernv/idle.c 
> b/arch/powerpc/platforms/powernv/idle.c
> index 2dd4673..1c9d0a9 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -611,6 +611,7 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
>   unsigned long srr1;
>   unsigned long pls;
>   unsigned long mmcr0 = 0;
> + unsigned long mmcra = 0;
>   struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
>   bool sprs_saved = false;
> 
> @@ -657,6 +658,21 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
> */
>   mmcr0   = mfspr(SPRN_MMCR0);
>   }
> +
> + if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> + /*
> +  * POWER10 uses MMCRA (BHRBRD) as BHRB disable bit.
> +  * If the user hasn't asked for the BHRB to be
> +  * written, the value of MMCRA[BHRBRD] is 1.
> +  * On wakeup from stop, MMCRA[BHRBD] will be 0,
> +  * since it is previleged resource and will be lost.
> +  * Thus, if we do not save and restore the MMCRA[BHRBD],
> +  * hardware will be needlessly writing to the BHRB
> +  * in problem mode.
> +  */
> + mmcra   = mfspr(SPRN_MMCRA);
> + }
> +
>   if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
>   sprs.lpcr   = mfspr(SPRN_LPCR);
>   sprs.hfscr  = mfspr(SPRN_HFSCR);
> @@ -700,8 +716,6 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
>   WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
> 
>   if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
> - unsigned long mmcra;
> -
>   /*
>* We don't need an isync after the mtsprs here because the
>* upcoming mtmsrd is execution synchronizing.
> @@ -721,6 +735,10 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
>   mtspr(SPRN_MMCR0, mmcr0);
>   }
> 
> + /* Reload MMCRA to restore BHRB disable bit for POWER10 */
> + if (cpu_has_feature(CPU_FTR_ARCH_31))
> + mtspr(SPRN_MMCRA, mmcra);
> +
>   /*
>* DD2.2 and earlier need to set then clear bit 60 in MMCRA
>* to ensure the PMU starts running.
> -- 
> 1.8.3.1
> 


Re: [PATCH 10/11] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-20 Thread Gautham R Shenoy
Hi Srikar,


On Mon, Jul 20, 2020 at 11:18:16AM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy  [2020-07-17 13:56:53]:
> 
> > On Tue, Jul 14, 2020 at 10:06:23AM +0530, Srikar Dronamraju wrote:
> > > Lookup the coregroup id from the associativity array.
> > > 
> > > If unable to detect the coregroup id, fallback on the core id.
> > > This way, ensure sched_domain degenerates and an extra sched domain is
> > > not created.
> > > 
> > > Ideally this function should have been implemented in
> > > arch/powerpc/kernel/smp.c. However, if it's implemented in mm/numa.c, we
> > > don't need to find the primary domain again.
> > > 
> > > If the device-tree mentions more than one coregroup, then the kernel
> > > implements only the last or the smallest coregroup, which currently
> > > corresponds to the penultimate domain in the device-tree.
> > > 
> > > Cc: linuxppc-dev 
> > > Cc: Michael Ellerman 
> > > Cc: Nick Piggin 
> > > Cc: Oliver OHalloran 
> > > Cc: Nathan Lynch 
> > > Cc: Michael Neuling 
> > > Cc: Anton Blanchard 
> > > Cc: Gautham R Shenoy 
> > > Cc: Vaidyanathan Srinivasan 
> > > Signed-off-by: Srikar Dronamraju 
> > > ---
> > >  arch/powerpc/mm/numa.c | 17 +
> > >  1 file changed, 17 insertions(+)
> > > 
> > > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> > > index d9ab9da85eab..4e85564ef62a 100644
> > > --- a/arch/powerpc/mm/numa.c
> > > +++ b/arch/powerpc/mm/numa.c
> > > @@ -1697,6 +1697,23 @@ static const struct proc_ops topology_proc_ops = {
> > > 
> > >  int cpu_to_coregroup_id(int cpu)
> > >  {
> > > + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> > > + int index;
> > > +
> > 
> > It would be good to have an assert here to ensure that we are calling
> > this function only when coregroups are enabled.
> > 
> > Else, we may end up returning the penultimate index which maps to the
> > chip-id.
> > 
> 
> We have a check below exactly for the same reason. Please look
below.

I saw that. However, it would be better to assert within the function
itself so that we don't call it from any other context without first
ascertaining that coregroups are enabled. Or at least add a comment in
the function saying that we should call it only after ascertaining that
coregroups are enabled.
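Something along these lines would work (just a sketch; the exact check
and the fallback are illustrative):

	int cpu_to_coregroup_id(int cpu)
	{
		/* Catch callers that bypass has_coregroup_support(). */
		if (WARN_ON_ONCE(!coregroup_enabled))
			return cpu_to_core_id(cpu);

		/* ... the existing VPHN-based lookup follows ... */
	}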



> 
> > 
> > 
> > > + if (cpu < 0 || cpu > nr_cpu_ids)
> > > + return -1;
> > > +
> > > + if (!firmware_has_feature(FW_FEATURE_VPHN))
> > > + goto out;
> > > +
> > > + if (vphn_get_associativity(cpu, associativity))
> > > + goto out;
> > > +
> > > + index = of_read_number(associativity, 1);
> > > + if ((index > min_common_depth + 1) && coregroup_enabled)
> > > + return of_read_number(&associativity[index - 1], 1);
> 
> See ^above.
> 
> index would be the all the domains in the associativity array, 
> min_common_depth would be where the primary domain or the chip-id is
> defined. So we are reading the penultimate domain if and only if the
> min_common_depth isn't the primary domain aka chip-id. 
> 
> What other check /assertions can we add?
> 
> 
> > > +
> > > +out:
> > >   return cpu_to_core_id(cpu);
> > >  }
> > > 
> > > -- 
> > > 2.17.1
> > > 
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH 06/11] powerpc/smp: Generalize 2nd sched domain

2020-07-20 Thread Gautham R Shenoy
On Mon, Jul 20, 2020 at 11:49:11AM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy  [2020-07-17 12:07:55]:
> 
> > On Tue, Jul 14, 2020 at 10:06:19AM +0530, Srikar Dronamraju wrote:
> > > Currently "CACHE" domain happens to be the 2nd sched domain as per
> > > powerpc_topology. This domain will collapse if cpumask of l2-cache is
> > > same as the SMT domain. However, we could generalize this domain such that it
> > > could be either a "CACHE" domain or a "BIGCORE" domain.
> > > 
> > > While setting up the "CACHE" domain, check if shared_cache is already
> > > set.
> > > 
> > > Cc: linuxppc-dev 
> > > Cc: Michael Ellerman 
> > > Cc: Nick Piggin 
> > > Cc: Oliver OHalloran 
> > > Cc: Nathan Lynch 
> > > Cc: Michael Neuling 
> > > Cc: Anton Blanchard 
> > > Cc: Gautham R Shenoy 
> > > Cc: Vaidyanathan Srinivasan 
> > > Signed-off-by: Srikar Dronamraju 
> > > ---
> > > @@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int 
> > > cpu)
> > >  }
> > >  #endif
> > > 
> > > +static const struct cpumask *cpu_bigcore_mask(int cpu)
> > > +{
> > > + return cpu_core_mask(cpu);
> > 
> > It should be cpu_smt_mask() if we want the redundant big-core to be
> > degenerated in favour of the SMT level on P8, no? Because
> > cpu_core_mask refers to all the CPUs that are in the same chip.
> > 
> 
> Right, but it can't be cpu_smt_mask since cpu_smt_mask is only enabled under
> CONFIG_SCHED_SMT. I was looking at using sibling_map, but we have to be
> careful for power9 / PowerNV mode. Guess that should be fine.

Ok.

> 
> > > +}
> > > +
> > >  static struct sched_domain_topology_level powerpc_topology[] = {
> > >  #ifdef CONFIG_SCHED_SMT
> > >   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> > >  #endif
> > > - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> > > + { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> > >   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> > >   { NULL, },
> > >  };
> > > @@ -1319,7 +1326,6 @@ static void add_cpu_to_masks(int cpu)
> > >  void start_secondary(void *unused)
> > >  {
> > >   unsigned int cpu = smp_processor_id();
> > > - struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> > > 
> > >   mmgrab(&init_mm);
> > >   current->active_mm = &init_mm;
> > > @@ -1345,14 +1351,20 @@ void start_secondary(void *unused)
> > >   /* Update topology CPU masks */
> > >   add_cpu_to_masks(cpu);
> > > 
> > > - if (has_big_cores)
> > > - sibling_mask = cpu_smallcore_mask;
> > >   /*
> > >* Check for any shared caches. Note that this must be done on a
> > >* per-core basis because one core in the pair might be disabled.
> > >*/
> > > - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
> > > - shared_caches = true;
> > > + if (!shared_caches) {
> > > + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> > > + struct cpumask *mask = cpu_l2_cache_mask(cpu);
> > > +
> > > + if (has_big_cores)
> > > + sibling_mask = cpu_smallcore_mask;
> > > +
> > > + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
> > > + shared_caches = true;
> > 
> > Shouldn't we use cpumask_subset() here ?
> 
> Wouldn't cpumask_subset return 1 if both are the same?

When are caches shared? When sibling_mask(cpu) is a
strict subset of cpu_l2_cache_mask(cpu). cpumask_weight() only
checks whether the number of CPUs in cpu_l2_cache_mask(cpu) is greater
than in sibling_mask(cpu), but not whether the constituent CPUs of the
former form a strict superset of the latter.

We are better off using

	if (!cpumask_equal(sibling_mask(cpu), mask) &&
	    cpumask_subset(sibling_mask(cpu), mask))

which is accurate.



> We don't want to have shared_caches set if both the masks are equal.


> 
> > 
> > > + }
> > > 
> > >   set_numa_node(numa_cpu_lookup_table[cpu]);
> > >   set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
> > > @@ -1390,6 +1402,14 @@ void __init smp_cpus_done(unsigned int max_cpus)
> > >   smp_ops->bringup_done();
> > > 
> > >   dump_numa_cpu_topology();
> > > + if (shared_caches) {
> > > + pr_info("Using shar

Re: [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-20 Thread Gautham R Shenoy
Hi Srikar,

On Mon, Jul 20, 2020 at 12:15:04PM +0530, Srikar Dronamraju wrote:
> * Gautham R Shenoy  [2020-07-17 11:30:11]:
> 
> > Hi Srikar,
> > 
> > On Tue, Jul 14, 2020 at 10:06:18AM +0530, Srikar Dronamraju wrote:
> > > Current code assumes that cpumask of cpus sharing a l2-cache mask will
> > > always be a superset of cpu_sibling_mask.
> > > 
> > > Lets stop that assumption.
> > > 
> > > Cc: linuxppc-dev 
> > > Cc: Michael Ellerman 
> > > Cc: Nick Piggin 
> > > Cc: Oliver OHalloran 
> > > Cc: Nathan Lynch 
> > > Cc: Michael Neuling 
> > > Cc: Anton Blanchard 
> > > Cc: Gautham R Shenoy 
> > > Cc: Vaidyanathan Srinivasan 
> > > Signed-off-by: Srikar Dronamraju 
> > > ---
> > >  arch/powerpc/kernel/smp.c | 28 +++-
> > >  1 file changed, 15 insertions(+), 13 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > > index 7d430fc536cc..875f57e41355 100644
> > > --- a/arch/powerpc/kernel/smp.c
> > > +++ b/arch/powerpc/kernel/smp.c
> > > @@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct 
> > > cpumask *(*mask_fn)(int))
> > >   struct device_node *l2_cache, *np;
> > >   int i;
> > > 
> > > + cpumask_set_cpu(cpu, mask_fn(cpu));
> > 
> > It would be good to comment on why we need to set the CPU in the
> > l2-mask if we don't have an l2-cache domain.
> > 
> 
> Good Catch, 
> We should move this after the cpu_to_l2cache.
> 
> > >   l2_cache = cpu_to_l2cache(cpu);
> > >   if (!l2_cache)
> > >   return false;
> > > @@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
> > >* add it to it's own thread sibling mask.
> > >*/
> > >   cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> > > + cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> > > 
> > >   for (i = first_thread; i < first_thread + threads_per_core; i++)
> > >   if (cpu_online(i))
> > >   set_cpus_related(i, cpu, cpu_sibling_mask);
> > > 
> > >   add_cpu_to_smallcore_masks(cpu);
> > > - /*
> > > -  * Copy the thread sibling mask into the cache sibling mask
> > > -  * and mark any CPUs that share an L2 with this CPU.
> > > -  */
> > > - for_each_cpu(i, cpu_sibling_mask(cpu))
> > > - set_cpus_related(cpu, i, cpu_l2_cache_mask);
> > >   update_mask_by_l2(cpu, cpu_l2_cache_mask);
> > > 
> > > - /*
> > > -  * Copy the cache sibling mask into core sibling mask and mark
> > > -  * any CPUs on the same chip as this CPU.
> > > -  */
> > > - for_each_cpu(i, cpu_l2_cache_mask(cpu))
> > > - set_cpus_related(cpu, i, cpu_core_mask);
> > > + if (pkg_id == -1) {
> > > + struct cpumask *(*mask)(int) = cpu_sibling_mask;
> > > +
> > > + /*
> > > +  * Copy the sibling mask into core sibling mask and
> > > +  * mark any CPUs on the same chip as this CPU.
> > > +  */
> > > + if (shared_caches)
> > > + mask = cpu_l2_cache_mask;
> > > +
> > 
> > 
> > Now that we are decoupling the containment relationship between
> > sibling_mask and the l2-cache mask, should we set all the CPUs that are
> > in cpu_sibling_mask(cpu) as well as those in cpu_l2_cache_mask(cpu) in
> > cpu_core_mask?
> > 
> 
> Are you saying that instead of setting this cpu in cpu_core_mask, we can set
> all the cpus in the mask in cpu_core_mask?

No. What I am referring to is in the for-loop below, you are setting
the CPUs that are set in mask(cpu) in the cpu_core_mask.

Now, the above code sets
mask(cpu) == cpu_sibling_mask(cpu) in the absence of shared_caches, and 
  == cpu_l2_cache_mask(cpu) in the presence of shared_cache.

Since we have dropped the assumption that cpu_sibling_mask(cpu) is
contained within cpu_l2_cache_mask(cpu), in the presence of a
shared cache, why are we only picking the CPUs in
cpu_l2_cache_mask(cpu) to be set in cpu_core_mask(cpu)? It should
ideally be the superset whose CPUs are set in
cpu_core_mask(cpu), and the correct cpumask is the union
cpumask_or(cpu_sibling_mask(cpu), cpu_l2_cache_mask(cpu)).
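That is, roughly (a sketch only; the temporary-mask allocation and the
error handling here are illustrative):

	cpumask_var_t mask;

	if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
		/* Union of the thread siblings and the L2 siblings. */
		cpumask_or(mask, cpu_sibling_mask(cpu), cpu_l2_cache_mask(cpu));
		for_each_cpu(i, mask)
			set_cpus_related(cpu, i, cpu_core_mask);
		free_cpumask_var(mask);
	}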


> Currently we don't know if any of the cpus of the mask were already set or
> not. Plus we anyway need to update the cpumask of all other cpus to say they
> are related. So setting a whole mask instead of one cpu at a time will not
> change anything on our side.
> 
> > > + for_each_cpu(i, mask(cpu))
> > > + set_cpus_related(cpu, i, cpu_core_mask);
> > > 
> > > - if (pkg_id == -1)
> > >   return;
> > > + }
> > > 
> > >   for_each_cpu(i, cpu_online_mask)
> > >   if (get_physical_package_id(i) == pkg_id)
> > > -- 
> > > 2.17.1
> > > 
> > --
> > Thanks and Regards
> > gautham.
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH v2 2/2] selftest/cpuidle: Add support for cpuidle latency measurement

2020-07-19 Thread Gautham R Shenoy
Hi Pratik,


On Fri, Jul 17, 2020 at 02:48:01PM +0530, Pratik Rajesh Sampat wrote:
> This patch adds support to trace IPI based and timer based wakeup
> latency from idle states
> 
> Latches onto the test-cpuidle_latency kernel module using the debugfs
> interface to send IPIs or schedule a timer based event, which in-turn
> populates the debugfs with the latency measurements.
> 
> Currently, for the IPI and timer tests, first disable all idle states
> and then test for latency measurements, incrementally enabling each state.
> 
> Signed-off-by: Pratik Rajesh Sampat 

A few comments below.

> ---
>  tools/testing/selftests/Makefile   |   1 +
>  tools/testing/selftests/cpuidle/Makefile   |   6 +
>  tools/testing/selftests/cpuidle/cpuidle.sh | 257 +
>  tools/testing/selftests/cpuidle/settings   |   1 +
>  4 files changed, 265 insertions(+)
>  create mode 100644 tools/testing/selftests/cpuidle/Makefile
>  create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh
>  create mode 100644 tools/testing/selftests/cpuidle/settings
> 

[..snip..]

> +
> +ins_mod()
> +{
> + if [ ! -f "$MODULE" ]; then
> + printf "$MODULE module does not exist. Exiting\n"

If the module has been compiled into the kernel (due to a
localyesconfig, for instance), then it is unlikely that we will find
it in /lib/modules. Perhaps you want to check if the debugfs
directories created by the module exist, and if so, print a message
saying that the module is already loaded, or some such?
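For instance (a sketch; "latency_test" is only my guess at the debugfs
directory name the module registers -- adjust to whatever the module
actually creates):

	# Module may be built in; its debugfs directory then already exists.
	if [ -d /sys/kernel/debug/latency_test ]; then
		printf "Latency measurement interface already present\n"
		return 0
	fi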

> + exit $ksft_skip
> + fi
> + printf "Inserting $MODULE module\n\n"
> + insmod $MODULE
> + if [ $? != 0 ]; then
> + printf "Insmod $MODULE failed\n"
> + exit $ksft_skip
> + fi
> +}
> +
> +compute_average()
> +{
> + arr=("$@")
> + sum=0
> + size=${#arr[@]}
> + for i in "${arr[@]}"
> + do
> + sum=$((sum + i))
> + done
> + avg=$((sum/size))

It would be good to assert that "size" isn't 0 here.
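Something like this at the top of the function would do (a sketch):

	size=${#arr[@]}
	if [ "$size" -eq 0 ]; then
		printf "compute_average called with no samples\n"
		exit 1
	fi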

> +}
> +
> +# Disable all stop states
> +disable_idle()
> +{
> + for ((cpu=0; cpu<NUM_CPUS; cpu++))
> + do
> + for ((state=0; state<NUM_STATES; state++))
> + do
> + echo 1 > /sys/devices/system/cpu/cpu$cpu/cpuidle/state$state/disable

So, on offlined CPUs, we won't see the
/sys/devices/system/cpu/cpu$cpu/cpuidle/state$state directory. You
should probably perform this operation only on online CPUs.
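For instance (a sketch; NUM_CPUS stands for whatever CPU-count variable
the script uses):

	for ((cpu=0; cpu<NUM_CPUS; cpu++))
	do
		online=$(cat /sys/devices/system/cpu/cpu$cpu/online 2>/dev/null)
		# cpu0 usually has no "online" file; treat it as online.
		if [ "$online" == "0" ]; then
			continue
		fi
		# ... operate on this CPU's cpuidle states ...
	done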


> + done
> + done
> +}
> +
> +# Perform operation on each CPU for the given state
> +# $1 - Operation: enable (0) / disable (1)
> +# $2 - State to enable
> +op_state()
> +{
> + for ((cpu=0; cpu<NUM_CPUS; cpu++))
> + do
> + echo $1 > /sys/devices/system/cpu/cpu$cpu/cpuidle/state$2/disable


Ditto

> + done
> +}

This is a helper function. For better readability of the main code you
can define the following wrappers and use them.


cpuidle_enable_state()
{
	state=$1
	op_state 0 $state
}

cpuidle_disable_state()
{
	state=$1
	op_state 1 $state
}


> +

[..snip..]

> +run_ipi_tests()
> +{
> +extract_latency
> +disable_idle
> +declare -a avg_arr
> +echo -e "--IPI Latency Test---" >> $LOG
> +
> + echo -e "--Baseline IPI Latency measurement: CPU Busy--" >> $LOG
> + printf "%s %10s %12s\n" "SRC_CPU" "DEST_CPU" "IPI_Latency(ns)" 
> >> $LOG
> + for ((cpu=0; cpu<NUM_CPUS; cpu++))
> + do
> + ipi_test_once "baseline" $cpu
> + printf "%-3s %10s %12s\n" $src_cpu $cpu $ipi_latency >> 
> $LOG
> + avg_arr+=($ipi_latency)
> + done
> + compute_average "${avg_arr[@]}"
> + echo -e "Baseline Average IPI latency(ns): $avg" >> $LOG
> +
> +for ((state=0; state<NUM_STATES; state++))
> +do
> + unset avg_arr
> + echo -e "---Enabling state: $state---" >> $LOG
> + op_state 0 $state
> + printf "%s %10s %12s\n" "SRC_CPU" "DEST_CPU" 
> "IPI_Latency(ns)" >> $LOG
> + for ((cpu=0; cpu<NUM_CPUS; cpu++))
> + do

If a CPU is offline, then we should skip it here.

> + # Running IPI test and logging results
> + sleep 1
> + ipi_test_once "test" $cpu
> + printf "%-3s %10s %12s\n" $src_cpu $cpu 
> $ipi_latency >> $LOG
> + avg_arr+=($ipi_latency)
> + done
> + compute_average "${avg_arr[@]}"
> + echo -e "Expected IPI latency(ns): 
> ${latency_arr[$state]}" >> $LOG
> + echo -e "Observed Average IPI latency(ns): $avg" >> $LOG
> + op_state 1 $state
> +done
> +}
> +
> +# Extract the residency in microseconds and convert to nanoseconds.
> +# Add 100 ns so that the timer stays for a little longer than the residency
> +extract_residency()
> +{
> 

Re: [PATCH v2 1/2] cpuidle: Trace IPI based and timer based wakeup latency from idle states

2020-07-19 Thread Gautham R Shenoy
On Fri, Jul 17, 2020 at 02:48:00PM +0530, Pratik Rajesh Sampat wrote:
> Fire directed smp_call_function_single IPIs from a specified source
> CPU to the specified target CPU to reduce the noise we have to wade
> through in the trace log.
> The module is based on the idea written by Srivatsa Bhat and maintained
> by Vaidyanathan Srinivasan internally.
> 
> Queue HR timer and measure jitter. Wakeup latency measurement for idle
> states using hrtimer.  Echo a value in ns to timer_test_function and
> watch the trace. An hrtimer will be queued, and when it fires the expected
> wakeup vs actual wakeup is computed and the delay printed in ns.
> 
> Implemented as a module which utilizes debugfs so that it can be
> integrated with selftests.
> 
> To include the module, enable the option under
> Kernel hacking -> Cpuidle latency selftests and build it as a module.
> 
> [srivatsa.b...@linux.vnet.ibm.com: Initial implementation in
>  cpuidle/sysfs]
> 
> [sva...@linux.vnet.ibm.com: wakeup latency measurements using hrtimer
>  and fix some of the time calculation]
> 
> [e...@linux.vnet.ibm.com: Fix some whitespace and tab errors and
>  increase the resolution of IPI wakeup]
> 
> Signed-off-by: Pratik Rajesh Sampat 


The debugfs module looks good to me.

Reviewed-by: Gautham R. Shenoy 


> ---
>  drivers/cpuidle/Makefile   |   1 +
>  drivers/cpuidle/test-cpuidle_latency.c | 150 +
>  lib/Kconfig.debug  |  10 ++
>  3 files changed, 161 insertions(+)
>  create mode 100644 drivers/cpuidle/test-cpuidle_latency.c
> 
> diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
> index f07800cbb43f..2ae05968078c 100644
> --- a/drivers/cpuidle/Makefile
> +++ b/drivers/cpuidle/Makefile
> @@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
>  obj-$(CONFIG_DT_IDLE_STATES)   += dt_idle_states.o
>  obj-$(CONFIG_ARCH_HAS_CPU_RELAX)   += poll_state.o
>  obj-$(CONFIG_HALTPOLL_CPUIDLE) += cpuidle-haltpoll.o
> +obj-$(CONFIG_IDLE_LATENCY_SELFTEST)+= test-cpuidle_latency.o
> 
>  
> ##
>  # ARM SoC drivers
> diff --git a/drivers/cpuidle/test-cpuidle_latency.c 
> b/drivers/cpuidle/test-cpuidle_latency.c
> new file mode 100644
> index ..61574665e972
> --- /dev/null
> +++ b/drivers/cpuidle/test-cpuidle_latency.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Module-based API test facility for cpuidle latency using IPIs and timers
> + */
> +
> +#include 
> +#include 
> +#include 
> +
> +/* IPI based wakeup latencies */
> +struct latency {
> + unsigned int src_cpu;
> + unsigned int dest_cpu;
> + ktime_t time_start;
> + ktime_t time_end;
> + u64 latency_ns;
> +} ipi_wakeup;
> +
> +static void measure_latency(void *info)
> +{
> + struct latency *v;
> + ktime_t time_diff;
> +
> + v = (struct latency *)info;
> + v->time_end = ktime_get();
> + time_diff = ktime_sub(v->time_end, v->time_start);
> + v->latency_ns = ktime_to_ns(time_diff);
> +}
> +
> +void run_smp_call_function_test(unsigned int cpu)
> +{
> + ipi_wakeup.src_cpu = smp_processor_id();
> + ipi_wakeup.dest_cpu = cpu;
> + ipi_wakeup.time_start = ktime_get();
> + smp_call_function_single(cpu, measure_latency, &ipi_wakeup, 1);
> +}
> +
> +/* Timer based wakeup latencies */
> +struct timer_data {
> + unsigned int src_cpu;
> + u64 timeout;
> + ktime_t time_start;
> + ktime_t time_end;
> + struct hrtimer timer;
> + u64 timeout_diff_ns;
> +} timer_wakeup;
> +
> +static enum hrtimer_restart timer_called(struct hrtimer *hrtimer)
> +{
> + struct timer_data *w;
> + ktime_t time_diff;
> +
> + w = container_of(hrtimer, struct timer_data, timer);
> + w->time_end = ktime_get();
> +
> + time_diff = ktime_sub(w->time_end, w->time_start);
> + time_diff = ktime_sub(time_diff, ns_to_ktime(w->timeout));
> + w->timeout_diff_ns = ktime_to_ns(time_diff);
> + return HRTIMER_NORESTART;
> +}
> +
> +static void run_timer_test(unsigned int ns)
> +{
> + hrtimer_init(&timer_wakeup.timer, CLOCK_MONOTONIC,
> +  HRTIMER_MODE_REL);
> + timer_wakeup.timer.function = timer_called;
> + timer_wakeup.time_start = ktime_get();
> + timer_wakeup.src_cpu = smp_processor_id();
> + timer_wakeup.timeout = ns;
> +
> + hrtimer_start(&timer_wakeup.timer, ns_to_ktime(ns),
> +   HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static struct dentry *dir;
> +
> +static 

Re: [PATCH 11/11] powerpc/smp: Provide an ability to disable coregroup

2020-07-17 Thread Gautham R Shenoy
On Tue, Jul 14, 2020 at 10:06:24AM +0530, Srikar Dronamraju wrote:
> If the user wants to enable the coregroup sched_domain, then they can boot
> with the kernel parameter "coregroup_support=on".
> 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 


We need this documented in the
Documentation/admin-guide/kernel-parameters.txt
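Something along these lines (a sketch of the entry; the wording is
illustrative, the accepted values match the parser in this patch):

	coregroup_support=	[PPC]
			Format: { on | 1 }
			Enable the coregroup scheduler domain on platforms
			whose device-tree describes coregroups.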

Other than that, the patch looks good to me.
Reviewed-by: Gautham R. Shenoy 

> ---
>  arch/powerpc/kernel/smp.c | 19 ++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index bb25c13bbb79..c43909e6e8e9 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -118,6 +118,23 @@ struct smp_ops_t *smp_ops;
>  volatile unsigned int cpu_callin_map[NR_CPUS];
> 
>  int smt_enabled_at_boot = 1;
> +int coregroup_support;
> +
> +static int __init early_coregroup(char *p)
> +{
> + if (!p)
> + return 0;
> +
> + if (strstr(p, "on"))
> + coregroup_support = 1;
> +
> + if (strstr(p, "1"))
> + coregroup_support = 1;
> +
> + return 0;
> +}
> +
> +early_param("coregroup_support", early_coregroup);
> 
>  /*
>   * Returns 1 if the specified cpu should be brought up during boot.
> @@ -878,7 +895,7 @@ static struct cpumask *cpu_coregroup_mask(int cpu)
> 
>  static bool has_coregroup_support(void)
>  {
> - return coregroup_enabled;
> + return coregroup_enabled && coregroup_support;
>  }
> 
>  static const struct cpumask *cpu_mc_mask(int cpu)
> -- 
> 2.17.1
> 


Re: [PATCH 10/11] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-17 Thread Gautham R Shenoy
On Tue, Jul 14, 2020 at 10:06:23AM +0530, Srikar Dronamraju wrote:
> Lookup the coregroup id from the associativity array.
> 
> If unable to detect the coregroup id, fallback on the core id.
> This way, ensure sched_domain degenerates and an extra sched domain is
> not created.
> 
> Ideally this function should have been implemented in
> arch/powerpc/kernel/smp.c. However, if it's implemented in mm/numa.c, we
> don't need to find the primary domain again.
> 
> If the device-tree mentions more than one coregroup, then the kernel
> implements only the last or the smallest coregroup, which currently
> corresponds to the penultimate domain in the device-tree.
> 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/mm/numa.c | 17 +
>  1 file changed, 17 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index d9ab9da85eab..4e85564ef62a 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1697,6 +1697,23 @@ static const struct proc_ops topology_proc_ops = {
> 
>  int cpu_to_coregroup_id(int cpu)
>  {
> + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> + int index;
> +

It would be good to have an assert here to ensure that we are calling
this function only when coregroups are enabled.

Else, we may end up returning the penultimate index which maps to the
chip-id.



> + if (cpu < 0 || cpu > nr_cpu_ids)
> + return -1;
> +
> + if (!firmware_has_feature(FW_FEATURE_VPHN))
> + goto out;
> +
> + if (vphn_get_associativity(cpu, associativity))
> + goto out;
> +
> + index = of_read_number(associativity, 1);
> + if ((index > min_common_depth + 1) && coregroup_enabled)
> + return of_read_number(&associativity[index - 1], 1);
> +
> +out:
>   return cpu_to_core_id(cpu);
>  }
> 
> -- 
> 2.17.1
> 


Re: [PATCH 09/11] Powerpc/smp: Create coregroup domain

2020-07-17 Thread Gautham R Shenoy
On Fri, Jul 17, 2020 at 01:49:26PM +0530, Gautham R Shenoy wrote:

> > +int cpu_to_coregroup_id(int cpu)
> > +{
> > +   return cpu_to_core_id(cpu);
> > +}
> 
> 
> So, if has_coregroup_support() returns true, then since the core_group
> identification is currently done through the core-id, the
> coregroup_mask is going to be the same as the
> cpu_core_mask/cpu_cpu_mask. Thus, we will be degenerating the DIE
> domain. Right? Instead, we could have kept the core-group to be a
> single bigcore by default, so that those domains can get degenerated
> while preserving the legacy SMT, DIE, NUMA hierarchy.

Never mind, got confused with core_id and cpu_core_mask (which
corresponds to the cores of a chip).  cpu_to_core_id() does exactly
what we have described above. Sorry for the noise.

I am ok with this patch, except that I would request that all the
changes to the topology structure be done within a single function for
ease of tracking, instead of distributing them all over the code.


> 
> 
> 
> 
> > +
> >  static int topology_update_init(void)
> >  {
> > start_topology_update();
> > -- 
> > 2.17.1
> > 


Re: [PATCH 09/11] Powerpc/smp: Create coregroup domain

2020-07-17 Thread Gautham R Shenoy
On Tue, Jul 14, 2020 at 10:06:22AM +0530, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
> 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/include/asm/topology.h | 10 
>  arch/powerpc/kernel/smp.c   | 37 +
>  arch/powerpc/mm/numa.c  |  5 
>  3 files changed, 52 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index 2db7ba789720..34812c35018e 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -98,6 +98,7 @@ extern int stop_topology_update(void);
>  extern int prrn_is_enabled(void);
>  extern int find_and_online_cpu_nid(int cpu);
>  extern int timed_topology_update(int nsecs);
> +extern int cpu_to_coregroup_id(int cpu);
>  #else
>  static inline int start_topology_update(void)
>  {
> @@ -120,6 +121,15 @@ static inline int timed_topology_update(int nsecs)
>   return 0;
>  }
> 
> +static inline int cpu_to_coregroup_id(int cpu)
> +{
> +#ifdef CONFIG_SMP
> + return cpu_to_core_id(cpu);
> +#else
> + return 0;
> +#endif
> +}
> +
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
>  #include 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index ef19eeccd21e..bb25c13bbb79 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
> +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
> 
>  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
> @@ -91,6 +92,7 @@ enum {
>   smt_idx,
>  #endif
>   bigcore_idx,
> + mc_idx,
>   die_idx,
>  };
> 
> @@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static struct cpumask *cpu_coregroup_mask(int cpu)
> +{
> + return per_cpu(cpu_coregroup_map, cpu);
> +}
> +
> +static bool has_coregroup_support(void)
> +{
> + return coregroup_enabled;
> +}
> +
> +static const struct cpumask *cpu_mc_mask(int cpu)
> +{
> + return cpu_coregroup_mask(cpu);
> +}
> +
>  static const struct cpumask *cpu_bigcore_mask(int cpu)
>  {
>   return cpu_core_mask(cpu);
> @@ -879,6 +896,7 @@ static struct sched_domain_topology_level 
> powerpc_topology[] = {
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
>   { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> + { cpu_mc_mask, SD_INIT_NAME(MC) },
>   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>   { NULL, },
>  };
> @@ -933,6 +951,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   GFP_KERNEL, node);
>   zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
>   GFP_KERNEL, node);
> + if (has_coregroup_support())
> + zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
> + GFP_KERNEL, node);
> +
>  #ifdef CONFIG_NEED_MULTIPLE_NODES
>   /*
>* numa_node_id() works after this.
> @@ -950,6 +972,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
>   cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
> 
> + if (has_coregroup_support())
> + cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> + else
> + powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
> +

The else part could be moved to the common function where we are
modifying the other attributes of the topology array.


>   init_big_cores();
>   if (has_big_cores) {
>   cpumask_set_cpu(boot_cpuid,
> @@ -1241,6 +1268,8 @@ static void remove_cpu_from_masks(int cpu)
>   set_cpus_unrelated(cpu, i, cpu_sibling_mask);
>   if (has_big_cores)
>   set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
> + if (has_coregroup_support())
> + set_cpus_unrelated(cpu, i, cpu_coregr

Re: [PATCH 08/11] powerpc/smp: Allocate cpumask only after searching thread group

2020-07-17 Thread Gautham R Shenoy
On Tue, Jul 14, 2020 at 10:06:21AM +0530, Srikar Dronamraju wrote:
> If allocated earlier and the search fails, then the cpumask needs to be
> freed. However, cpu_l1_cache_map can be allocated after we search the
> thread group.
> 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 

Good fix.

Reviewed-by: Gautham R. Shenoy 

> ---
>  arch/powerpc/kernel/smp.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 96e47450d9b3..ef19eeccd21e 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
>   if (err)
>   goto out;
> 
> - zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> - GFP_KERNEL,
> - cpu_to_node(cpu));
> -
>   cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
> 
>   if (unlikely(cpu_group_start == -1)) {
> @@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
>   goto out;
>   }
> 
> + zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> + GFP_KERNEL, cpu_to_node(cpu));
> +
>   for (i = first_thread; i < first_thread + threads_per_core; i++) {
>   int i_group_start = get_cpu_thread_group_start(i, &tg);
> 
> -- 
> 2.17.1
> 


Re: [PATCH 07/11] Powerpc/numa: Detect support for coregroup

2020-07-17 Thread Gautham R Shenoy
On Tue, Jul 14, 2020 at 10:06:20AM +0530, Srikar Dronamraju wrote:
> Add support for grouping cores based on the device-tree classification.
> - The last domain in the associativity domains always refers to the
> core.
> - If primary reference domain happens to be the penultimate domain in
> the associativity domains device-tree property, then there are no
> coregroups. However, if it's not the penultimate domain, then there are
> coregroups. There can be more than one coregroup. For now we are
> interested in the last or the smallest coregroup.
> 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 

This looks good to me from the discovery of the core-group point of
view.

Reviewed-by: Gautham R. Shenoy 

> ---
>  arch/powerpc/include/asm/smp.h |  1 +
>  arch/powerpc/kernel/smp.c  |  1 +
>  arch/powerpc/mm/numa.c | 34 +-
>  3 files changed, 23 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 49a25e2400f2..5bdc17a7049f 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -28,6 +28,7 @@
>  extern int boot_cpuid;
>  extern int spinning_secondaries;
>  extern u32 *cpu_to_phys_id;
> +extern bool coregroup_enabled;
> 
>  extern void cpu_die(void);
>  extern int cpu_to_chip_id(int cpu);
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index f8faf75135af..96e47450d9b3 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
> 
>  struct task_struct *secondary_current;
>  bool has_big_cores;
> +bool coregroup_enabled;
> 
>  DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index fc7b0505bdd8..a43eab455be4 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -887,7 +887,9 @@ static void __init setup_node_data(int nid, u64 
> start_pfn, u64 end_pfn)
>  static void __init find_possible_nodes(void)
>  {
>   struct device_node *rtas;
> - u32 numnodes, i;
> + const __be32 *domains;
> + int prop_length, max_nodes;
> + u32 i;
> 
>   if (!numa_enabled)
>   return;
> @@ -896,25 +898,31 @@ static void __init find_possible_nodes(void)
>   if (!rtas)
>   return;
> 
> - if (of_property_read_u32_index(rtas, 
> "ibm,current-associativity-domains",
> - min_common_depth, &numnodes)) {
> - /*
> -  * ibm,current-associativity-domains is a fairly recent
> -  * property. If it doesn't exist, then fallback on
> -  * ibm,max-associativity-domains. Current denotes what the
> -  * platform can support compared to max which denotes what the
> -  * Hypervisor can support.
> -  */
> - if (of_property_read_u32_index(rtas, 
> "ibm,max-associativity-domains",
> - min_common_depth, &numnodes))
> + /*
> +  * ibm,current-associativity-domains is a fairly recent property. If
> +  * it doesn't exist, then fallback on ibm,max-associativity-domains.
> +  * Current denotes what the platform can support compared to max
> +  * which denotes what the Hypervisor can support.
> +  */
> + domains = of_get_property(rtas, "ibm,current-associativity-domains",
> + &prop_length);
> + if (!domains) {
> + domains = of_get_property(rtas, "ibm,max-associativity-domains",
> + &prop_length);
> + if (!domains)
>   goto out;
>   }
> 
> - for (i = 0; i < numnodes; i++) {
> + max_nodes = of_read_number(&domains[min_common_depth], 1);
> + for (i = 0; i < max_nodes; i++) {
>   if (!node_possible(i))
>   node_set(i, node_possible_map);
>   }
> 
> + prop_length /= sizeof(int);
> + if (prop_length > min_common_depth + 2)
> + coregroup_enabled = 1;
> +
>  out:
>   of_node_put(rtas);
>  }
> -- 
> 2.17.1
> 


Re: [PATCH 06/11] powerpc/smp: Generalize 2nd sched domain

2020-07-17 Thread Gautham R Shenoy
On Tue, Jul 14, 2020 at 10:06:19AM +0530, Srikar Dronamraju wrote:
> Currently "CACHE" domain happens to be the 2nd sched domain as per
> powerpc_topology. This domain will collapse if cpumask of l2-cache is
> same as the SMT domain. However, we could generalize this domain such that it
> could be either a "CACHE" domain or a "BIGCORE" domain.
> 
> While setting up the "CACHE" domain, check if shared_cache is already
> set.
> 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/kernel/smp.c | 48 +++
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 875f57e41355..f8faf75135af 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
>  EXPORT_SYMBOL_GPL(has_big_cores);
> 
> +enum {
> +#ifdef CONFIG_SCHED_SMT
> + smt_idx,
> +#endif
> + bigcore_idx,
> + die_idx,
> +};
> +
>  #define MAX_THREAD_LIST_SIZE 8
>  #define THREAD_GROUP_SHARE_L1   1
>  struct thread_groups {
> @@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> - if (shared_caches)
> - return cpu_l2_cache_mask(cpu);
> -
> - if (has_big_cores)
> - return cpu_smallcore_mask(cpu);
> -
> - return cpu_smt_mask(cpu);
> + return per_cpu(cpu_l2_cache_map, cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static const struct cpumask *cpu_bigcore_mask(int cpu)
> +{
> + return cpu_core_mask(cpu);

It should be cpu_smt_mask() if we want the redundant big-core to be
degenerated in favour of the SMT level on P8, no? Because
cpu_core_mask refers to all the CPUs that are in the same chip.
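That is, something like (a sketch; note that cpu_smt_mask() is only
available under CONFIG_SCHED_SMT, so a fallback would be needed):

	static const struct cpumask *cpu_bigcore_mask(int cpu)
	{
	#ifdef CONFIG_SCHED_SMT
		return cpu_smt_mask(cpu);
	#else
		return cpu_sibling_mask(cpu);
	#endif
	}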


> +}
> +
>  static struct sched_domain_topology_level powerpc_topology[] = {
>  #ifdef CONFIG_SCHED_SMT
>   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> + { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
>   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>   { NULL, },
>  };
> @@ -895,7 +902,7 @@ static int init_big_cores(void)
> 
>  #ifdef CONFIG_SCHED_SMT
>   pr_info("Big cores detected. Using small core scheduling\n");
> - powerpc_topology[0].mask = smallcore_smt_mask;
> + powerpc_topology[smt_idx].mask = smallcore_smt_mask;
>  #endif
> 
>   return 0;
> @@ -1319,7 +1326,6 @@ static void add_cpu_to_masks(int cpu)
>  void start_secondary(void *unused)
>  {
>   unsigned int cpu = smp_processor_id();
> - struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> 
>   mmgrab(&init_mm);
>   current->active_mm = &init_mm;
> @@ -1345,14 +1351,20 @@ void start_secondary(void *unused)
>   /* Update topology CPU masks */
>   add_cpu_to_masks(cpu);
> 
> - if (has_big_cores)
> - sibling_mask = cpu_smallcore_mask;
>   /*
>* Check for any shared caches. Note that this must be done on a
>* per-core basis because one core in the pair might be disabled.
>*/
> - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
> - shared_caches = true;
> + if (!shared_caches) {
> + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> + struct cpumask *mask = cpu_l2_cache_mask(cpu);
> +
> + if (has_big_cores)
> + sibling_mask = cpu_smallcore_mask;
> +
> + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
> + shared_caches = true;

Shouldn't we use cpumask_subset() here ?


> + }
> 
>   set_numa_node(numa_cpu_lookup_table[cpu]);
>   set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
> @@ -1390,6 +1402,14 @@ void __init smp_cpus_done(unsigned int max_cpus)
>   smp_ops->bringup_done();
> 
>   dump_numa_cpu_topology();
> + if (shared_caches) {
> + pr_info("Using shared cache scheduler topology\n");
> + powerpc_topology[bigcore_idx].mask = shared_cache_mask;
> +#ifdef CONFIG_SCHED_DEBUG
> + p

Re: [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-17 Thread Gautham R Shenoy
Hi Srikar,

On Tue, Jul 14, 2020 at 10:06:18AM +0530, Srikar Dronamraju wrote:
> Current code assumes that cpumask of cpus sharing a l2-cache mask will
> always be a superset of cpu_sibling_mask.
> 
> Lets stop that assumption.
> 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/kernel/smp.c | 28 +++-
>  1 file changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 7d430fc536cc..875f57e41355 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
> *(*mask_fn)(int))
>   struct device_node *l2_cache, *np;
>   int i;
> 
> + cpumask_set_cpu(cpu, mask_fn(cpu));

It would be good to comment on why we need to set the CPU in the
l2-mask if we don't have an l2-cache domain.

>   l2_cache = cpu_to_l2cache(cpu);
>   if (!l2_cache)
>   return false;
> @@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
>* add it to it's own thread sibling mask.
>*/
>   cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> + cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
>   for (i = first_thread; i < first_thread + threads_per_core; i++)
>   if (cpu_online(i))
>   set_cpus_related(i, cpu, cpu_sibling_mask);
> 
>   add_cpu_to_smallcore_masks(cpu);
> - /*
> -  * Copy the thread sibling mask into the cache sibling mask
> -  * and mark any CPUs that share an L2 with this CPU.
> -  */
> - for_each_cpu(i, cpu_sibling_mask(cpu))
> - set_cpus_related(cpu, i, cpu_l2_cache_mask);
>   update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> - /*
> -  * Copy the cache sibling mask into core sibling mask and mark
> -  * any CPUs on the same chip as this CPU.
> -  */
> - for_each_cpu(i, cpu_l2_cache_mask(cpu))
> - set_cpus_related(cpu, i, cpu_core_mask);
> + if (pkg_id == -1) {
> + struct cpumask *(*mask)(int) = cpu_sibling_mask;
> +
> + /*
> +  * Copy the sibling mask into core sibling mask and
> +  * mark any CPUs on the same chip as this CPU.
> +  */
> + if (shared_caches)
> + mask = cpu_l2_cache_mask;
> +


Now that we are decoupling the containment relationship between
sibling_mask and the l2-cache mask, should we set all the CPUs that are
in cpu_sibling_mask(cpu) as well as those in cpu_l2_cache_mask(cpu) in
cpu_core_mask?

> + for_each_cpu(i, mask(cpu))
> + set_cpus_related(cpu, i, cpu_core_mask);
> 
> - if (pkg_id == -1)
>   return;
> + }
> 
>   for_each_cpu(i, cpu_online_mask)
>   if (get_physical_package_id(i) == pkg_id)
> -- 
> 2.17.1
> 
--
Thanks and Regards
gautham.

