Re: [RFC] cpuidle/powernv : Support for pre-entry and post exit of stop state in firmware

2020-04-08 Thread Gautham R Shenoy
Hi Abhishek,

On Fri, Apr 03, 2020 at 04:27:01AM -0500, Abhishek Goel wrote:
> This patch provides kernel framework fro opal support of save restore
> of sprs in idle stop loop. Opal support for stop states is needed to
> selectively enable stop states or to introduce a quirk quickly in case
> a buggy stop state is present.
> 
> We make a opal call from kernel if firmware-stop-support for stop
> states is present and enabled. All the quirks for pre-entry of stop
> state is handled inside opal. A call from opal is made into kernel
> where we execute stop afer saving of NVGPRs.
> After waking up from 0x100 vector in kernel, we enter back into opal.
> All the quirks in post exit path, if any, are then handled in opal,
> from where we return successfully back to kernel.
> For deep stop states in which additional SPRs are lost, saving and
> restoration will be done in OPAL.
> 
> This idea was first proposed by Nick here:
> https://patchwork.ozlabs.org/patch/1208159/
> 
> The corresponding skiboot patch for this kernel patch is here:
> https://patchwork.ozlabs.org/patch/1265959/
> 
> When we callback from OPAL into kernel, r13 is clobbered. So, to
> access PACA we need to restore it from HSPRGO. In future we can
> handle this into OPAL as in here:
> https://patchwork.ozlabs.org/patch/1245275/
> 
> Signed-off-by: Abhishek Goel 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/include/asm/opal-api.h|  8 -
>  arch/powerpc/include/asm/opal.h|  3 ++
>  arch/powerpc/kernel/idle_book3s.S  |  5 +++
>  arch/powerpc/platforms/powernv/idle.c  | 37 ++
>  arch/powerpc/platforms/powernv/opal-call.c |  2 ++
>  5 files changed, 54 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/opal-api.h 
> b/arch/powerpc/include/asm/opal-api.h
> index c1f25a760eb1..a2c782c99c9e 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -214,7 +214,9 @@
>  #define OPAL_SECVAR_GET  176
>  #define OPAL_SECVAR_GET_NEXT 177
>  #define OPAL_SECVAR_ENQUEUE_UPDATE   178
> -#define OPAL_LAST178
> +#define OPAL_REGISTER_OS_OPS 181
> +#define OPAL_CPU_IDLE182
> +#define OPAL_LAST182
> 
>  #define QUIESCE_HOLD 1 /* Spin all calls at entry */
>  #define QUIESCE_REJECT   2 /* Fail all calls with 
> OPAL_BUSY */
> @@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump {
>   struct  opal_mpipl_region region[];
>  } __packed;
> 
> +struct opal_os_ops {
> + __be64 os_idle_stop;
> +};
> +
>  #endif /* __ASSEMBLY__ */
> 
>  #endif /* __OPAL_API_H */
> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index 9986ac34b8e2..3c340bc4df8e 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -400,6 +400,9 @@ void opal_powercap_init(void);
>  void opal_psr_init(void);
>  void opal_sensor_groups_init(void);
> 
> +extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops);
> +extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr);
> +
>  #endif /* __ASSEMBLY__ */
> 
>  #endif /* _ASM_POWERPC_OPAL_H */
> diff --git a/arch/powerpc/kernel/idle_book3s.S 
> b/arch/powerpc/kernel/idle_book3s.S
> index 22f249b6f58d..8d287d1d06c0 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss)
>   */
>  _GLOBAL(isa300_idle_stop_mayloss)
>   mtspr   SPRN_PSSCR,r3
> + mr  r6, r13
> + mfspr   r13, SPRN_HSPRG0
>   std r1,PACAR1(r13)
>   mflrr4
>   mfcrr5
> @@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss)
>   std r31,-8*18(r1)
>   std r4,-8*19(r1)
>   std r5,-8*20(r1)
> + std r6,-8*21(r1)
>   /* 168 bytes */
>   PPC_STOP
>   b   .   /* catch bugs */
> @@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss)
>   ld  r1,PACAR1(r13)
>   ld  r4,-8*19(r1)
>   ld  r5,-8*20(r1)
> + ld  r6,-8*21(r1)
>   mtlrr4
>   mtcrr5
> + mr  r13,r6
>   /*
>* KVM nap requires r2 to be saved, rather than just restoring it
>* from PACATOC. This could be avoided for that less common case
> diff --git a/arch/powerpc/platforms/powernv/idle.c 
> b/arch/powerpc/platforms/powernv/idle.c
> index 78599bca66c2..1841027b25c5 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -35,6 +35,7 @@
>  static u32 supported_cpuidle_states;
>  struct pnv_idle_states_t *pnv_idle_states;
>  int nr_pnv_idle_states;
> +static bool firmware_stop_supported;
> 
>  /*
>   * The default stop state that will be used by ppc_md.power_save
> @@ -602,6 +603,25 @@ struct p9_sprs {
>   u64 uamor;
>  };
> 
> +/*
> + * This function is called from OPAL if 

Re: [RFC PATCH 2/3] powerpc/lib: Initialize a temporary mm for code patching

2020-04-08 Thread Christophe Leroy




Le 31/03/2020 à 05:19, Christopher M Riedl a écrit :

On March 24, 2020 11:10 AM Christophe Leroy  wrote:

  
Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :

When code patching a STRICT_KERNEL_RWX kernel the page containing the
address to be patched is temporarily mapped with permissive memory
protections. Currently, a per-cpu vmalloc patch area is used for this
purpose. While the patch area is per-cpu, the temporary page mapping is
inserted into the kernel page tables for the duration of the patching.
The mapping is exposed to CPUs other than the patching CPU - this is
undesirable from a hardening perspective.

Use the `poking_init` init hook to prepare a temporary mm and patching
address. Initialize the temporary mm by copying the init mm. Choose a
randomized patching address inside the temporary mm userspace address
portion. The next patch uses the temporary mm and patching address for
code patching.

Based on x86 implementation:

commit 4fc19708b165
("x86/alternatives: Initialize temporary mm for patching")

Signed-off-by: Christopher M. Riedl 
---
   arch/powerpc/lib/code-patching.c | 26 ++
   1 file changed, 26 insertions(+)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 3345f039a876..18b88ecfc5a8 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -11,6 +11,8 @@
   #include 
   #include 
   #include 
+#include 
+#include 
   
   #include 

   #include 
@@ -39,6 +41,30 @@ int raw_patch_instruction(unsigned int *addr, unsigned int 
instr)
   }
   
   #ifdef CONFIG_STRICT_KERNEL_RWX

+
+__ro_after_init struct mm_struct *patching_mm;
+__ro_after_init unsigned long patching_addr;


Can we make those those static ?



Yes, makes sense to me.


+
+void __init poking_init(void)
+{
+   spinlock_t *ptl; /* for protecting pte table */
+   pte_t *ptep;
+
+   patching_mm = copy_init_mm();
+   BUG_ON(!patching_mm);


Does it needs to be a BUG_ON() ? Can't we fail gracefully with just a
WARN_ON ?



I'm not sure what failing gracefully means here? The main reason this could
fail is if there is not enough memory to allocate the patching_mm. The
previous implementation had this justification for BUG_ON():


But the system can continue running just fine after this failure.
Only the things that make use of code patching will fail (ftrace, kgdb, ...)

Checkpatch tells: "Avoid crashing the kernel - try using WARN_ON & 
recovery code rather than BUG() or BUG_ON()"


All vital code patching has already been done previously, so I think a 
WARN_ON() should be enough, plus returning non 0 to indicate that the 
late_initcall failed.





/*
  * Run as a late init call. This allows all the boot time patching to be done
  * simply by patching the code, and then we're called here prior to
  * mark_rodata_ro(), which happens after all init calls are run. Although
  * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
  * it as being preferable to a kernel that will crash later when someone tries
  * to use patch_instruction().
  */
static int __init setup_text_poke_area(void)
{
 BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
 "powerpc/text_poke:online", text_area_cpu_up,
 text_area_cpu_down));

 return 0;
}
late_initcall(setup_text_poke_area);

I think the BUG_ON() is appropriate even if only to adhere to the previous
judgement call. I can add a similar comment explaining the reasoning if
that helps.


+
+   /*
+* In hash we cannot go above DEFAULT_MAP_WINDOW easily.
+* XXX: Do we want additional bits of entropy for radix?
+*/
+   patching_addr = (get_random_long() & PAGE_MASK) %
+   (DEFAULT_MAP_WINDOW - PAGE_SIZE);
+
+   ptep = get_locked_pte(patching_mm, patching_addr, );
+   BUG_ON(!ptep);


Same here, can we fail gracefully instead ?



Same reasoning as above.


Here as well, a WARN_ON() should be enough, the system will continue 
running after that.





+   pte_unmap_unlock(ptep, ptl);
+}
+
   static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
   
   static int text_area_cpu_up(unsigned int cpu)




Christophe


Christophe


Re: [RFC 3/3] Introduce capability for firmware-enabled-stop

2020-04-08 Thread Gautham R Shenoy
Hi Pratik,

On Wed, Mar 04, 2020 at 09:31:23PM +0530, Pratik Rajesh Sampat wrote:
> Design patch that introduces the capability for firmware to handle the
> stop states instead. A bit is set based on the discovery of the feature
> and correspondingly also the responsibility to handle the stop states.
> 
> The commit does not contain calling into the firmware to utilize
> firmware enabled stop.
> 
> Signed-off-by: Pratik Rajesh Sampat 
> ---
>  arch/powerpc/include/asm/processor.h | 1 +
>  arch/powerpc/kernel/dt_cpu_ftrs.c| 9 +
>  2 files changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/processor.h 
> b/arch/powerpc/include/asm/processor.h
> index 277dbabafd02..978fab35d133 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -430,6 +430,7 @@ extern unsigned long cpuidle_disable;
>  enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
> 
>  #define STOP_ENABLE  0x0001
> +#define FIRMWARE_STOP_ENABLE 0x0010


This could be made a bit in the "version" variable.

> 
>  #define STOP_VERSION_P9   0x1
>  #define STOP_VERSION_P9_V10x2
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
> b/arch/powerpc/kernel/dt_cpu_ftrs.c
> index 63e30aa49356..e00f8afabc46 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -313,6 +313,14 @@ static int __init feat_enable_idle_stop_quirk(struct 
> dt_cpu_feature *f)
> 
>   return 1;
>  }
> +
> +static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f)
> +{
> + stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE;

stop_dep.cpuidle_version |= FIRMWARE_STOP_V1; or some such
variant.


> +
> + return 1;
> +}
> +
>  static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f)
>  {
>   u64 lpcr;
> @@ -608,6 +616,7 @@ static struct dt_cpu_feature_match __initdata
>   {"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0},
>   {"idle-stop", feat_enable_idle_stop, 0},
>   {"idle-stop-v1", feat_enable_idle_stop_quirk, 0},
> + {"firmware-stop-supported", feat_enable_firmware_stop, 0},
>   {"machine-check-power8", feat_enable_mce_power8, 0},
>   {"performance-monitor-power8", feat_enable_pmu_power8, 0},
>   {"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR},
> -- 
> 2.24.1
> 


Re: [RFC 1/3] Interface for an idle-stop dependency structure

2020-04-08 Thread Gautham R Shenoy
Hi Pratik,

On Wed, Mar 04, 2020 at 09:31:21PM +0530, Pratik Rajesh Sampat wrote:
> Design patch to introduce the idea of having a dependency structure for
> idle-stop. The structure encapsulates the following:
> 1. Bitmask for version of idle-stop
> 2. Bitmask for propterties like ENABLE/DISABLE
> 3. Function pointer which helps handle how the stop must be invoked
> 
> The commit lays a foundation for other idle-stop versions to be added
> and handled cleanly based on their specified requirments.
> Currently it handles the existing "idle-stop" version by setting the
> discovery bits and the function pointer.

So, if this patch is applied, and we are running with an OPAL that
doesn't publish the "idle-stop" dt-cpu-feature, then the goal is to
not enable any stop states. Is this correct ?


> 
> Signed-off-by: Pratik Rajesh Sampat 
> ---
>  arch/powerpc/include/asm/processor.h  | 17 +
>  arch/powerpc/kernel/dt_cpu_ftrs.c |  5 +
>  arch/powerpc/platforms/powernv/idle.c | 17 +
>  drivers/cpuidle/cpuidle-powernv.c |  3 ++-
>  4 files changed, 37 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/processor.h 
> b/arch/powerpc/include/asm/processor.h
> index eedcbfb9a6ff..da59f01a5c09 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -429,6 +429,23 @@ extern void power4_idle_nap(void);
>  extern unsigned long cpuidle_disable;
>  enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
> 
> +#define STOP_ENABLE  0x0001
> +
> +#define STOP_VERSION_P9   0x1
> +
> +/*
> + * Classify the dependencies of the stop states
> + * @idle_stop: function handler to handle the quirk stop version
> + * @cpuidle_prop: Signify support for stop states through kernel and/or 
> firmware
> + * @stop_version: Classify quirk versions for stop states
> + */
> +typedef struct {
> + unsigned long (*idle_stop)(unsigned long, bool);
> + uint8_t cpuidle_prop;
> + uint8_t stop_version;

Why do we need both cpuidle_prop and stop_version ? 


> +}stop_deps_t;
> +extern stop_deps_t stop_dep;
> +
>  extern int powersave_nap;/* set if nap mode can be used in idle loop */
> 
>  extern void power7_idle_type(unsigned long type);
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
> b/arch/powerpc/kernel/dt_cpu_ftrs.c
> index 182b4047c1ef..db1a525e090d 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -292,6 +292,8 @@ static int __init feat_enable_idle_stop(struct 
> dt_cpu_feature *f)
>   lpcr |=  LPCR_PECE1;
>   lpcr |=  LPCR_PECE2;
>   mtspr(SPRN_LPCR, lpcr);
> + stop_dep.cpuidle_prop |= STOP_ENABLE;
> + stop_dep.stop_version = STOP_VERSION_P9;

Doesn't stop_version != 0 imply (stop_dep.cpuidle_prop & STOP_ENABLE)?

> 
>   return 1;
>  }
> @@ -657,6 +659,9 @@ static void __init cpufeatures_setup_start(u32 isa)
>   }
>  }
> 
> +stop_deps_t stop_dep = {NULL, 0x0, 0x0};
> +EXPORT_SYMBOL(stop_dep);
> +
>  static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
>  {
>   const struct dt_cpu_feature_match *m;
> diff --git a/arch/powerpc/platforms/powernv/idle.c 
> b/arch/powerpc/platforms/powernv/idle.c
> index 78599bca66c2..c32cdc37acf4 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -812,7 +812,7 @@ static unsigned long power9_offline_stop(unsigned long 
> psscr)
> 
>  #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>   __ppc64_runlatch_off();
> - srr1 = power9_idle_stop(psscr, true);
> + srr1 = stop_dep.idle_stop(psscr, true);
>   __ppc64_runlatch_on();
>  #else
>   /*
> @@ -828,7 +828,7 @@ static unsigned long power9_offline_stop(unsigned long 
> psscr)
>   local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
> 
>   __ppc64_runlatch_off();
> - srr1 = power9_idle_stop(psscr, false);
> + srr1 = stop_dep.idle_stop(psscr, true);
>   __ppc64_runlatch_on();
> 
>   local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
> @@ -856,7 +856,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
>   psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
> 
>   __ppc64_runlatch_off();
> - srr1 = power9_idle_stop(psscr, true);
> + srr1 = stop_dep.idle_stop(psscr, true);
>   __ppc64_runlatch_on();
>

There is one other place in arch/powerpc/kvm/book3s_hv_rmhandlers.S
where call isa300_idle_stop_mayloss (this is kvm_nap_sequence).

So, if stop states are not supported, then, KVM subsystem should know
about it. Some KVM configurations depend on putting the secondary
threads of the core offline into an idle state whose wakeup is from
0x100 vector. Your patch doesn't address that part.

>   fini_irq_for_idle_irqsoff();
> @@ -1353,8 +1353,17 @@ static int __init pnv_init_idle_states(void)
>   nr_pnv_idle_states = 0;
>   supported_cpuidle_states = 0;
> 
> - if (cpuidle_disable != 

[PATCH] i2c: powermac: Simplify reading the "reg" and "i2c-address" property

2020-04-08 Thread Aishwarya R
Use of_property_read_u32 to read the "reg" and "i2c-address" property
instead of using of_get_property to check the return values.

Signed-off-by: Aishwarya R 
---
 drivers/i2c/busses/i2c-powermac.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/i2c/busses/i2c-powermac.c 
b/drivers/i2c/busses/i2c-powermac.c
index d565714c1f13..81506c2dab65 100644
--- a/drivers/i2c/busses/i2c-powermac.c
+++ b/drivers/i2c/busses/i2c-powermac.c
@@ -207,18 +207,18 @@ static u32 i2c_powermac_get_addr(struct i2c_adapter *adap,
   struct pmac_i2c_bus *bus,
   struct device_node *node)
 {
-   const __be32 *prop;
-   int len;
+   u32 prop;
+   int ret;
 
/* First check for valid "reg" */
-   prop = of_get_property(node, "reg", );
-   if (prop && (len >= sizeof(int)))
-   return (be32_to_cpup(prop) & 0xff) >> 1;
+   ret = of_property_read_u32(node, "reg", );
+   if (ret == 0)
+   return (prop & 0xff) >> 1;
 
/* Then check old-style "i2c-address" */
-   prop = of_get_property(node, "i2c-address", );
-   if (prop && (len >= sizeof(int)))
-   return (be32_to_cpup(prop) & 0xff) >> 1;
+   ret = of_property_read_u32(node, "i2c-address", );
+   if (ret == 0)
+   return (prop & 0xff) >> 1;
 
/* Now handle some devices with missing "reg" properties */
if (of_node_name_eq(node, "cereal"))
-- 
2.17.1



Re: [RFC] Support stop state version quirk and firmware enabled stop

2020-04-08 Thread Gautham R Shenoy
Hi Pratik,

On Wed, Mar 04, 2020 at 09:26:48PM +0530, Pratik Rajesh Sampat wrote:
> A concept patch in Skiboot to illustrate the case wherein handling of
> stop states for different DD versions of a CPU can be achieved by a
> simple modification in the list of cpu_features.
> As an example idle-stop1 is defined which uses P9_CPU_DD1 to define the
> cpu feature.
> 
> Along with that, an implementation is being worked upon the LE OPAL
> series which helps OPAL handle the stop state entry and exit.
> 
> This patch advertises this capability of the firmware which can be
> availed if the quirk-version-setting is not cognizable.
> 
> The firmware-enabled stop is being worked by Abhishek Goel
>  building upon the LE OPAL series.
> 
> Signed-off-by: Pratik Rajesh Sampat 
> ---
>  core/cpufeatures.c | 22 ++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/core/cpufeatures.c b/core/cpufeatures.c
> index ec30c975..b9875e7b 100644
> --- a/core/cpufeatures.c
> +++ b/core/cpufeatures.c
> @@ -510,6 +510,25 @@ static const struct cpu_feature cpu_features_table[] = {
>   -1, -1, -1,
>   NULL, },
> 
> + /*
> +  * QUIRK for ISAv3.0B stop idle instructions and registers
> +  * Helps us determine if there are any quirks
> +  * XXX: Same of idle-stop
> +  */
> + { "idle-stop-v1",
> + CPU_P9_DD1,
> + ISA_V3_0B, USABLE_HV|USABLE_OS,
> + HV_CUSTOM, OS_CUSTOM,
> + -1, -1, -1,
> + NULL, },


So, at this point, we don't need any such quirk for any of the DD
version right ? This is to demonstrate that if say P9_DD1 had a quirk
w.r.t stop-state handling, then this is how we would advertise it to
the kernel.

> +
> + { "firmware-stop-supported",
> + CPU_P9,
> + ISA_V3_0B, USABLE_HV|USABLE_OS,
> + HV_CUSTOM, OS_CUSTOM,
> + -1, -1, -1,
> + NULL, },
> +


I suppose this is for the opal-cpuidle driver support posted here:
https://lists.ozlabs.org/pipermail/skiboot/2020-April/016726.html

>   /*
>* ISAv3.0B Hypervisor Virtualization Interrupt
>* Also associated system registers, LPCR EE, HEIC, HVICE,
> @@ -883,6 +902,9 @@ static void add_cpufeatures(struct dt_node *cpus,
>   const struct cpu_feature *f = _features_table[i];
> 
>   if (f->cpus_supported & cpu_feature_cpu) {
> + if (!strcmp(f->name, "firmware-stop-supported") &&
> + HAVE_BIG_ENDIAN)
> + continue;

In OPAL do we have an macro defining BIG_ENDIAN ? If yes, you could
wrap the "firmware-stop-supported" in cpu_features_table[] within
#ifndef BIG_ENDIAN. That way you won't need a special case here.



>   DBG("  '%s'\n", f->name);
>   add_cpu_feature_nodeps(features, f);
>   }
> -- 
> 2.24.1
> 

--
Thanks and Regards
gautham.


Re: [PATCH kernel v2 0/7] powerpc/powenv/ioda: Allow huge DMA window at 4GB

2020-04-08 Thread Alexey Kardashevskiy



On 23/03/2020 18:53, Alexey Kardashevskiy wrote:
> Here is an attempt to support bigger DMA space for devices
> supporting DMA masks less than 59 bits (GPUs come into mind
> first). POWER9 PHBs have an option to map 2 windows at 0
> and select a windows based on DMA address being below or above
> 4GB.
> 
> This adds the "iommu=iommu_bypass" kernel parameter and
> supports VFIO+pseries machine - current this requires telling
> upstream+unmodified QEMU about this via
> -global spapr-pci-host-bridge.dma64_win_addr=0x1
> or per-phb property. 4/4 advertises the new option but
> there is no automation around it in QEMU (should it be?).
> 
> For now it is either 1<<59 or 4GB mode; dynamic switching is
> not supported (could be via sysfs).
> 
> This is a rebased version of
> https://lore.kernel.org/kvm/20191202015953.127902-1-...@ozlabs.ru/
> 
> The main change since v1 is that now it is 7 patches with
> clearer separation of steps.
> 
> 
> This is based on 6c90b86a745a "Merge tag 'mmc-v5.6-rc6' of 
> git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc"
> 
> Please comment. Thanks.

Ping?


> 
> 
> 
> Alexey Kardashevskiy (7):
>   powerpc/powernv/ioda: Move TCE bypass base to PE
>   powerpc/powernv/ioda: Rework for huge DMA window at 4GB
>   powerpc/powernv/ioda: Allow smaller TCE table levels
>   powerpc/powernv/phb4: Use IOMMU instead of bypassing
>   powerpc/iommu: Add a window number to
> iommu_table_group_ops::get_table_size
>   powerpc/powernv/phb4: Add 4GB IOMMU bypass mode
>   vfio/spapr_tce: Advertise and allow a huge DMA windows at 4GB
> 
>  arch/powerpc/include/asm/iommu.h  |   3 +
>  arch/powerpc/include/asm/opal-api.h   |   9 +-
>  arch/powerpc/include/asm/opal.h   |   2 +
>  arch/powerpc/platforms/powernv/pci.h  |   4 +-
>  include/uapi/linux/vfio.h |   2 +
>  arch/powerpc/platforms/powernv/npu-dma.c  |   1 +
>  arch/powerpc/platforms/powernv/opal-call.c|   2 +
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c |   4 +-
>  arch/powerpc/platforms/powernv/pci-ioda.c | 234 ++
>  drivers/vfio/vfio_iommu_spapr_tce.c   |  17 +-
>  10 files changed, 213 insertions(+), 65 deletions(-)
> 

-- 
Alexey


Re: [PATCH V2 0/3] mm/debug: Add more arch page table helper tests

2020-04-08 Thread Anshuman Khandual


On 04/07/2020 09:24 PM, Gerald Schaefer wrote:
> On Sun, 5 Apr 2020 17:58:14 +0530
> Anshuman Khandual  wrote:
> 
> [...]
>>>
>>> Could be fixed like this (the first de-reference is a bit special,
>>> because at that point *ptep does not really point to a large (pmd) entry
>>> yet, it is initially an invalid pte entry, which breaks our huge_ptep_get() 
>>>  
>>
>> There seems to be an inconsistency on s390 platform. Even though it defines
>> a huge_ptep_get() override, it does not subscribe __HAVE_ARCH_HUGE_PTEP_GET
>> which should have forced it fallback on generic huge_ptep_get() but it does
>> not :) Then I realized that __HAVE_ARCH_HUGE_PTEP_GET only makes sense when
>> an arch uses . s390 does not use that and hence gets
>> away with it's own huge_ptep_get() without __HAVE_ARCH_HUGE_PTEP_GET. Sounds
>> confusing ? But I might not have the entire context here.
> 
> Yes, that sounds very confusing. Also a bit ironic, since huge_ptep_get()
> was initially introduced because of s390, and now we don't select
> __HAVE_ARCH_HUGE_PTEP_GET...
> 
> As you realized, I guess this is because we do not use generic hugetlb.h.
> And when __HAVE_ARCH_HUGE_PTEP_GET was introduced with commit 544db7597ad
> ("hugetlb: introduce generic version of huge_ptep_get"), that was probably
> the reason why we did not get our share of __HAVE_ARCH_HUGE_PTEP_GET.

Understood.

> 
> Nothing really wrong with that, but yes, very confusing. Maybe we could
> also select it for s390, even though it wouldn't have any functional
> impact (so far), just for less confusion. Maybe also thinking about
> using the generic hugetlb.h, not sure if the original reasons for not
> doing so would still apply. Now I only need to find the time...

Seems like something worth to explore if we could remove this confusion.

> 
>>
>>> conversion logic. I also added PMD_MASK alignment for RANDOM_ORVALUE,
>>> because we do have some special bits there in our large pmds. It seems
>>> to also work w/o that alignment, but it feels a bit wrong):  
>>
>> Sure, we can accommodate that.
>>
>>>
>>> @@ -731,26 +731,26 @@ static void __init hugetlb_advanced_test
>>>   unsigned long vaddr, pgprot_t 
>>> prot)
>>>  {
>>> struct page *page = pfn_to_page(pfn);
>>> -   pte_t pte = READ_ONCE(*ptep);
>>> +   pte_t pte;
>>>
>>> -   pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
>>> +   pte = pte_mkhuge(mk_pte_phys(RANDOM_ORVALUE & PMD_MASK, prot));  
>>
>> So that keeps the existing value in 'ptep' pointer at bay and instead
>> construct a PTE from scratch. I would rather have READ_ONCE(*ptep) at
>> least provide the seed that can be ORed with RANDOM_ORVALUE before
>> being masked with PMD_MASK. Do you see any problem ?
> 
> Yes, unfortunately. The problem is that the resulting pte is not marked
> as present. The conversion pte -> (huge) pmd, which is done in
> set_huge_pte_at() for s390, will establish an empty pmd for non-present
> ptes, all the RANDOM_ORVALUE stuff is lost. And a subsequent
> huge_ptep_get() will not result in the same original pte value. If you

Ohh.

> want to preserve and check the RANDOM_ORVALUE, it has to be a present
> pte, hence the mk_pte(_phys).

Understood and mk_pte() is also available on all platforms.

> 
>>
>> Some thing like this instead.
>>
>> pte_t pte = READ_ONCE(*ptep);
>> pte = pte_mkhuge(__pte((pte_val(pte) | RANDOM_ORVALUE) & PMD_MASK));
>>
>> We cannot use mk_pte_phys() as it is defined only on some platforms
>> without any generic fallback for others.
> 
> Oh, didn't know that, sorry. What about using mk_pte() instead, at least
> it would result in a present pte:
> 
> pte = pte_mkhuge(mk_pte(phys_to_page(RANDOM_ORVALUE & PMD_MASK), prot));

Lets use mk_pte() here but can we do this instead

paddr = (__pfn_to_phys(pfn) | RANDOM_ORVALUE) & PMD_MASK;
pte = pte_mkhuge(mk_pte(phys_to_page(paddr), prot));

> 
> And if you also want to do some with the existing value, which seems
> to be an empty pte, then maybe just check if writing and reading that
> value with set_huge_pte_at() / huge_ptep_get() returns the same,
> i.e. initially w/o RANDOM_ORVALUE.
> 
> So, in combination, like this (BTW, why is the barrier() needed, it
> is not used for the other set_huge_pte_at() calls later?):

Ahh missed, will add them. Earlier we faced problem without it after
set_pte_at() for a test on powerpc (64) platform. Hence just added it
here to be extra careful.

> 
> @@ -733,24 +733,28 @@ static void __init hugetlb_advanced_test
> struct page *page = pfn_to_page(pfn);
> pte_t pte = READ_ONCE(*ptep);
>  
> -   pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
> +   set_huge_pte_at(mm, vaddr, ptep, pte);
> +   WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
> +
> +   pte = pte_mkhuge(mk_pte(phys_to_page(RANDOM_ORVALUE & PMD_MASK), 
> prot));
> set_huge_pte_at(mm, vaddr, ptep, pte);
> barrier();
> WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
> 
> 

Re: [PATCH] cxl: Rework error message for incompatible slots

2020-04-08 Thread Frederic Barrat




Le 08/04/2020 à 04:13, Andrew Donnellan a écrit :

On 7/4/20 9:56 pm, Frederic Barrat wrote:

Improve the error message shown if a capi adapter is plugged on a
capi-incompatible slot directly under the PHB (no intermediate switch).

Fixes: 5632874311db ("cxl: Add support for POWER9 DD2")
Cc: sta...@vger.kernel.org # 4.14+
Signed-off-by: Frederic Barrat 


Seems fine to me, not sure if it needs to go to stable but I suppose 
this could be causing actual confusion out in the field?



Yes it does. The reason for this patch is it was hit by a customer.

  Fred



Reviewed-by: Andrew Donnellan 




Re: [PATCH 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms

2020-04-08 Thread Christophe Leroy




Le 08/04/2020 à 01:32, Benjamin Herrenschmidt a écrit :

On Fri, 2020-04-03 at 15:59 +1100, Michael Ellerman wrote:

Benjamin Herrenschmidt  writes:

On Tue, 2020-03-31 at 16:30 +1100, Michael Ellerman wrote:

I have no attachment to 40x, and I'd certainly be happy to have
less
code in the tree, we struggle to keep even the modern platforms
well
maintained.

At the same time I don't want to render anyone's hardware
obsolete
unnecessarily. But if there's really no one using 40x then we
should
remove it, it could well be broken already.

So I guess post a series to do the removal and we'll see if
anyone
speaks up.


We shouldn't remove 40x completely. Just remove the Xilinx 405
stuff.


Congratulations on becoming the 40x maintainer!


Didn't I give you my last 40x system ? :-) IBM still put 40x cores
inside POWER chips no ?



According to Wikipedia (https://en.wikipedia.org/wiki/PowerPC_400), 405 
cores are used in modern equipments (how modern ?), however 403 has 
never reached the market.


Can we start removing 403 stuff ? That's not a lot, but still.

Does anybody knows anything about this ERRATUM 77 stuff ? Is that still 
an issue with all 405 cores or has this been fixed long time ago and can 
be removed ?


Christophe


Re: [PATCH 3/4] powerpc/eeh: Remove workaround from eeh_add_device_late()

2020-04-08 Thread Oliver O'Halloran
On Wed, Apr 8, 2020 at 4:22 PM Sam Bobroff  wrote:
>
> On Fri, Apr 03, 2020 at 05:08:32PM +1100, Oliver O'Halloran wrote:
> > On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> > > When EEH device state was released asynchronously by the device
> > > release handler, it was possible for an outstanding reference to
> > > prevent it's release and it was necessary to work around that if a
> > > device was re-discovered at the same PCI location.
> >
> > I think this is a bit misleading. The main situation where you'll hit
> > this hack is when recovering a device with a driver that doesn't
> > implement the error handling callbacks. In that case the device is
> > removed, reset, then re-probed by the PCI core, but we assume it's the
> > same physical device so the eeh_device state remains active.
> >
> > If you actually changed the underlying device I suspect something bad
> > would happen.
>
> I'm not sure I understand. Isn't the case you're talking about caught by
> the earlier check (just above the patch)?
>
> if (edev->pdev == dev) {
> eeh_edev_dbg(edev, "Device already referenced!\n");
> return;
> }

No, in the case I'm talking about the pci_dev is torn down and
freed(). After the PE is reset we re-probe the device and create a new
pci_dev.  If the release of the old pci_dev is delayed we need the
hack this patch is removing.

The check above should probably be a WARN_ON() since we should never
be re-running the EEH probe on the same device. I think there is a
case where that can happen, but I don't remember the details.

Oliver


Re: [PATCH 4/4] powerpc/eeh: Clean up edev cleanup for VFs

2020-04-08 Thread Sam Bobroff
On Fri, Apr 03, 2020 at 04:45:47PM +1100, Oliver O'Halloran wrote:
> On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> > Because the bus notifier calls eeh_rmv_from_parent_pe() (via
> > eeh_remove_device()) when a VF is removed, the call in
> > remove_sriov_vf_pdns() is redundant.
> 
> eeh_rmv_from_parent_pe() won't actually remove the device if the
> recovering flag is set on the PE. Are you sure we're not introducing a
> race here?
> 

Ah, I assume you're referring to the difference between calling
eeh_remove_device() and directly calling eeh_rmv_from_parent_pe(), where
the behaviour for PE's with EEH_PE_KEEP set is subtly different.

I'll take a closer look at it and make sure to explain it better in v2.


signature.asc
Description: PGP signature


Re: [PATCH 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms

2020-04-08 Thread Geert Uytterhoeven
Hi Ben,

On Wed, Apr 8, 2020 at 1:34 AM Benjamin Herrenschmidt
 wrote:
> On Fri, 2020-04-03 at 15:59 +1100, Michael Ellerman wrote:
> > Benjamin Herrenschmidt  writes:
> > > On Tue, 2020-03-31 at 16:30 +1100, Michael Ellerman wrote:
> > > > I have no attachment to 40x, and I'd certainly be happy to have
> > > > less
> > > > code in the tree, we struggle to keep even the modern platforms
> > > > well
> > > > maintained.
> > > >
> > > > At the same time I don't want to render anyone's hardware
> > > > obsolete
> > > > unnecessarily. But if there's really no one using 40x then we
> > > > should
> > > > remove it, it could well be broken already.
> > > >
> > > > So I guess post a series to do the removal and we'll see if
> > > > anyone
> > > > speaks up.
> > >
> > > We shouldn't remove 40x completely. Just remove the Xilinx 405
> > > stuff.
> >
> > Congratulations on becoming the 40x maintainer!
>
> Didn't I give you my last 40x system ? :-) IBM still put 40x cores
> inside POWER chips no ?

Good to know!

Are they still big-endian, or have they been corrupted by the LE frenzy,
too? ;)

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [PATCH 3/4] powerpc/eeh: Remove workaround from eeh_add_device_late()

2020-04-08 Thread Sam Bobroff
On Fri, Apr 03, 2020 at 05:08:32PM +1100, Oliver O'Halloran wrote:
> On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> > When EEH device state was released asynchronously by the device
> > release handler, it was possible for an outstanding reference to
> > prevent it's release and it was necessary to work around that if a
> > device was re-discovered at the same PCI location.
> 
> I think this is a bit misleading. The main situation where you'll hit
> this hack is when recovering a device with a driver that doesn't
> implement the error handling callbacks. In that case the device is
> removed, reset, then re-probed by the PCI core, but we assume it's the
> same physical device so the eeh_device state remains active.
> 
> If you actually changed the underlying device I suspect something bad
> would happen.

I'm not sure I understand. Isn't the case you're talking about caught by
the earlier check (just above the patch)?

if (edev->pdev == dev) {
eeh_edev_dbg(edev, "Device already referenced!\n");
return;
}
> 
> > Now that the state is released synchronously that is no longer
> > possible and the workaround is no longer necessary.
> 
> You could probably fold this into the previous patch, but eh. You could
> probably fold this into the previous patch, but eh.

True.

> > Signed-off-by: Sam Bobroff 
> > ---
> >  arch/powerpc/kernel/eeh.c | 23 +--
> >  1 file changed, 1 insertion(+), 22 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> > index c36c5a7db5ca..12c248a16527 100644
> > --- a/arch/powerpc/kernel/eeh.c
> > +++ b/arch/powerpc/kernel/eeh.c
> > @@ -1206,28 +1206,7 @@ void eeh_add_device_late(struct pci_dev *dev)
> > eeh_edev_dbg(edev, "Device already referenced!\n");
> > return;
> > }
> > -
> > -   /*
> > -* The EEH cache might not be removed correctly because of
> > -* unbalanced kref to the device during unplug time, which
> > -* relies on pcibios_release_device(). So we have to remove
> > -* that here explicitly.
> > -*/
> > -   if (edev->pdev) {
> > -   eeh_rmv_from_parent_pe(edev);
> > -   eeh_addr_cache_rmv_dev(edev->pdev);
> > -   eeh_sysfs_remove_device(edev->pdev);
> > -
> > -   /*
> > -* We definitely should have the PCI device removed
> > -* though it wasn't correctly. So we needn't call
> > -* into error handler afterwards.
> > -*/
> > -   edev->mode |= EEH_DEV_NO_HANDLER;
> > -
> > -   edev->pdev = NULL;
> > -   dev->dev.archdata.edev = NULL;
> > -   }
> > +   WARN_ON_ONCE(edev->pdev);
> >  
> > if (eeh_has_flag(EEH_PROBE_MODE_DEV))
> > eeh_ops->probe(pdn, NULL);
> 


signature.asc
Description: PGP signature


Re: [PATCH 2/4] powerpc/eeh: Release EEH device state synchronously

2020-04-08 Thread Sam Bobroff
On Fri, Apr 03, 2020 at 03:51:18PM +1100, Oliver O'Halloran wrote:
> On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> > EEH device state is currently removed (by eeh_remove_device()) during
> > the device release handler, which is invoked as the device's reference
> > count drops to zero. This may take some time, or forever, as other
> > threads may hold references.
> > 
> > However, the PCI device state is released synchronously by
> > pci_stop_and_remove_bus_device(). This mismatch causes problems, for
> > example the device may be re-discovered as a new device before the
> > release handler has been called, leaving the PCI and EEH state
> > mismatched.
> > 
> > So instead, call eeh_remove_device() from the bus device removal
> > handlers, which are called synchronously in the removal path.
> > 
> > Signed-off-by: Sam Bobroff 
> > ---
> >  arch/powerpc/kernel/eeh.c | 26 ++
> >  arch/powerpc/kernel/pci-hotplug.c |  2 --
> >  2 files changed, 26 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> > index 17cb3e9b5697..c36c5a7db5ca 100644
> > --- a/arch/powerpc/kernel/eeh.c
> > +++ b/arch/powerpc/kernel/eeh.c
> > @@ -1106,6 +1106,32 @@ static int eeh_init(void)
> >  
> >  core_initcall_sync(eeh_init);
> >  
> > +static int eeh_device_notifier(struct notifier_block *nb,
> > +  unsigned long action, void *data)
> > +{
> > +   struct device *dev = data;
> > +
> > +   switch (action) {
> > +   case BUS_NOTIFY_DEL_DEVICE:
> > +   eeh_remove_device(to_pci_dev(dev));
> > +   break;
> > +   default:
> > +   break;
> > +   }
> 
> A comment briefly explaining why we're not doing anything in the add
> case might be nice.

Good point, I'll add something for v2.
> 
> Reviewed-by: Oliver O'Halloran 
> 
> > +   return NOTIFY_DONE;
> > +}
> > +
> > +static struct notifier_block eeh_device_nb = {
> > +   .notifier_call = eeh_device_notifier,
> > +};
> > +
> > +static __init int eeh_set_bus_notifier(void)
> > +{
> > +   bus_register_notifier(_bus_type, _device_nb);
> > +   return 0;
> > +}
> > +arch_initcall(eeh_set_bus_notifier);
> > +
> >  /**
> >   * eeh_add_device_early - Enable EEH for the indicated device node
> >   * @pdn: PCI device node for which to set up EEH
> > diff --git a/arch/powerpc/kernel/pci-hotplug.c 
> > b/arch/powerpc/kernel/pci-hotplug.c
> > index d6a67f814983..28e9aa274f64 100644
> > --- a/arch/powerpc/kernel/pci-hotplug.c
> > +++ b/arch/powerpc/kernel/pci-hotplug.c
> > @@ -57,8 +57,6 @@ void pcibios_release_device(struct pci_dev *dev)
> > struct pci_controller *phb = pci_bus_to_host(dev->bus);
> > struct pci_dn *pdn = pci_get_pdn(dev);
> >  
> > -   eeh_remove_device(dev);
> > -
> > if (phb->controller_ops.release_device)
> > phb->controller_ops.release_device(dev);
> >  
> 


signature.asc
Description: PGP signature


<    1   2