Re: [PATCH 1/3] powerpc/powernv/pci: Track largest available TCE order per PHB

2018-07-02 Thread Russell Currey
On Mon, 2018-07-02 at 17:34 +1000, Alexey Kardashevskiy wrote:
> On Mon, 2 Jul 2018 17:32:56 +1000
> Alexey Kardashevskiy  wrote:
> 
> > On Fri, 29 Jun 2018 17:34:35 +1000
> > Russell Currey  wrote:
> > 
> > > Knowing the largest possible TCE size of a PHB is useful, so get
> > > it
> > > out of the device tree.  This relies on the property being added
> > > in
> > > OPAL.
> > > 
> > > It is assumed that any PHB4 or later machine would be running
> > > firmware that implemented this property, and otherwise assumed to
> > > be PHB3, which has a maximum TCE order of 28 bits or 256MB TCEs.
> > > 
> > > This is used later in the series.
> > > 
> > > Signed-off-by: Russell Currey 
> > > ---
> > >  arch/powerpc/platforms/powernv/pci-ioda.c | 16 
> > >  arch/powerpc/platforms/powernv/pci.h  |  3 +++
> > >  2 files changed, 19 insertions(+)
> > > 
> > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> > > b/arch/powerpc/platforms/powernv/pci-ioda.c index
> > > 5bd0eb6681bc..17c590087279 100644 ---
> > > a/arch/powerpc/platforms/powernv/pci-ioda.c +++
> > > b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3873,11 +3873,13
> > > @@
> > > static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> > > struct resource r; const __be64 *prop64;
> > >   const __be32 *prop32;
> > > + struct property *prop;
> > >   int len;
> > >   unsigned int segno;
> > >   u64 phb_id;
> > >   void *aux;
> > >   long rc;
> > > + u32 val;
> > >  
> > >   if (!of_device_is_available(np))
> > >   return;
> > > @@ -4016,6 +4018,20 @@ static void __init
> > > pnv_pci_init_ioda_phb(struct device_node *np, }
> > >   phb->ioda.pe_array = aux + pemap_off;
> > >  
> > > + phb->ioda.max_tce_order = 0;
> > > + /* Get TCE order from the DT.  If it's not present,
> > > assume
> > > P8 */
> > > + if (!of_get_property(np, "ibm,supported-tce-sizes",
> > > NULL))
> > > {
> > > + phb->ioda.max_tce_order = 28; /* assume P8 256mb
> > > TCEs */
> > > + } else {
> > > + of_property_for_each_u32(np,
> > > "ibm,supported-tce-sizes", prop,
> > > +  prop32, val) {
> > > + if (val > phb->ioda.max_tce_order)
> > > + phb->ioda.max_tce_order = val;
> > > + }
> > > + pr_debug("PHB%llx Found max TCE order of %d
> > > bits\n",
> > > +  phb->opal_id, phb->ioda.max_tce_order);
> > > + }  
> > 
> > 
> > pnv_ioda_parse_tce_sizes() does this, use it. It even reports 256MB
> > pages for P8 as in v4.18-rc3.
> 
> 
> ah, no, not in rc3, my bad. I'll post it soon.

Sure, whatever works, no need for duplication

> 
> 
> --
> Alexey


Re: [PATCH v2 1/2] powernv/cpuidle: Parse dt idle properties into global structure

2018-07-02 Thread kbuild test robot
Hi Akshay,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.18-rc3 next-20180702]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Akshay-Adiga/powernv-cpuidle-Device-tree-parsing-cleanup/20180703-024607
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.2.0 make.cross ARCH=powerpc 

Note: the 
linux-review/Akshay-Adiga/powernv-cpuidle-Device-tree-parsing-cleanup/20180703-024607
 HEAD 4beae9263d77036dae7f43905823867c6d982690 builds fine.
  It only hurts bisectability.

All errors (new ones prefixed by >>):

   drivers/cpuidle/cpuidle-powernv.c: In function 'powernv_add_idle_states':
>> drivers/cpuidle/cpuidle-powernv.c:417:28: error: 'state' undeclared (first 
>> use in this function); did you mean 'statx'?
  if (has_stop_states && !(state->valid))
   ^
   statx
   drivers/cpuidle/cpuidle-powernv.c:417:28: note: each undeclared identifier 
is reported only once for each function it appears in

vim +417 drivers/cpuidle/cpuidle-powernv.c

   262  
   263  extern u32 pnv_get_supported_cpuidle_states(void);
   264  static int powernv_add_idle_states(void)
   265  {
   266  struct device_node *power_mgt;
   267  int nr_idle_states = 1; /* Snooze */
   268  int dt_idle_states, count;
   269  u32 latency_ns[CPUIDLE_STATE_MAX];
   270  u32 residency_ns[CPUIDLE_STATE_MAX];
   271  u32 flags[CPUIDLE_STATE_MAX];
   272  u64 psscr_val[CPUIDLE_STATE_MAX];
   273  u64 psscr_mask[CPUIDLE_STATE_MAX];
   274  const char *names[CPUIDLE_STATE_MAX];
   275  u32 has_stop_states = 0;
   276  int i, rc;
   277  u32 supported_flags = pnv_get_supported_cpuidle_states();
   278  
   279  
   280  /* Currently we have snooze statically defined */
   281  
   282  power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
   283  if (!power_mgt) {
   284  pr_warn("opal: PowerMgmt Node not found\n");
   285  goto out;
   286  }
   287  
   288  /* Read values of any property to determine the num of idle 
states */
   289  dt_idle_states = of_property_count_u32_elems(power_mgt, 
"ibm,cpu-idle-state-flags");
   290  if (dt_idle_states < 0) {
   291  pr_warn("cpuidle-powernv: no idle states found in the 
DT\n");
   292  goto out;
   293  }
   294  
   295  count = of_property_count_u32_elems(power_mgt,
   296  
"ibm,cpu-idle-state-latencies-ns");
   297  
   298  if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", 
dt_idle_states,
   299 "ibm,cpu-idle-state-latencies-ns",
   300 count) != 0)
   301  goto out;
   302  
   303  count = of_property_count_strings(power_mgt,
   304"ibm,cpu-idle-state-names");
   305  if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", 
dt_idle_states,
   306 "ibm,cpu-idle-state-names",
   307 count) != 0)
   308  goto out;
   309  
   310  /*
   311   * Since snooze is used as first idle state, max idle states 
allowed is
   312   * CPUIDLE_STATE_MAX -1
   313   */
   314  if (dt_idle_states > CPUIDLE_STATE_MAX - 1) {
   315  pr_warn("cpuidle-powernv: discovered idle states more 
than allowed");
   316  dt_idle_states = CPUIDLE_STATE_MAX - 1;
   317  }
   318  
   319  if (of_property_read_u32_array(power_mgt,
   320  "ibm,cpu-idle-state-flags", flags, 
dt_idle_states)) {
   321  pr_warn("cpuidle-powernv : missing 
ibm,cpu-idle-state-flags in DT\n");
   322  goto out;
   323  }
   324  
   325  if (of_property_read_u32_array(power_mgt,
   326  "ibm,cpu-idle-state-latencies-ns", latency_ns,
   327  dt_idle_states)) {
   328  pr_warn("cpuidle-powernv: missing 
ibm,cpu-idle-state-latencies-ns in DT\n");
   329

Re: [PATCH] kbuild: move bin2c back to scripts/ from scripts/basic/

2018-07-02 Thread Masahiro Yamada
2018-06-26 1:40 GMT+09:00 Masahiro Yamada :
> Commit 8370edea81e3 ("bin2c: move bin2c in scripts/basic") moved bin2c
> to the scripts/basic/ directory, incorrectly stating "Kexec wants to
> use bin2c and it wants to use it really early in the build process.
> See arch/x86/purgatory/ code in later patches."
>
> Commit bdab125c9301 ("Revert "kexec/purgatory: Add clean-up for
> purgatory directory"") and commit d6605b6bbee8 ("x86/build: Remove
> unnecessary preparation for purgatory") removed the redundant
> purgatory build magic entirely.
>
> That means that the move of bin2c was unnecessary in the first place.
>
> Signed-off-by: Masahiro Yamada 


Applied to linux-kbuild.


>
>  arch/powerpc/purgatory/Makefile | 3 +--
>  arch/s390/purgatory/Makefile| 3 +--
>  arch/x86/purgatory/Makefile | 3 +--
>  kernel/Makefile | 2 +-
>  scripts/.gitignore  | 1 +
>  scripts/Makefile| 1 +
>  scripts/basic/.gitignore| 1 -
>  scripts/basic/Makefile  | 1 -
>  scripts/{basic => }/bin2c.c | 0
>  security/tomoyo/Makefile| 2 +-
>  10 files changed, 7 insertions(+), 10 deletions(-)
>  rename scripts/{basic => }/bin2c.c (100%)
>
> diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
> index 30e05de..4314ba5 100644
> --- a/arch/powerpc/purgatory/Makefile
> +++ b/arch/powerpc/purgatory/Makefile
> @@ -6,9 +6,8 @@ LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined
>  $(obj)/purgatory.ro: $(obj)/trampoline.o FORCE
> $(call if_changed,ld)
>
> -CMD_BIN2C = $(objtree)/scripts/basic/bin2c
>  quiet_cmd_bin2c = BIN2C   $@
> -  cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@
> +  cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@
>
>  $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
> $(call if_changed,bin2c)
> diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
> index 1ace023..445c460 100644
> --- a/arch/s390/purgatory/Makefile
> +++ b/arch/s390/purgatory/Makefile
> @@ -27,9 +27,8 @@ KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
>  $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
> $(call if_changed,ld)
>
> -CMD_BIN2C = $(objtree)/scripts/basic/bin2c
>  quiet_cmd_bin2c = BIN2C   $@
> -  cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@
> +  cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@
>
>  $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
> $(call if_changed,bin2c)
> diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
> index 2e9ee02..d6ac098 100644
> --- a/arch/x86/purgatory/Makefile
> +++ b/arch/x86/purgatory/Makefile
> @@ -28,9 +28,8 @@ $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
>
>  targets += kexec-purgatory.c
>
> -CMD_BIN2C = $(objtree)/scripts/basic/bin2c
>  quiet_cmd_bin2c = BIN2C   $@
> -  cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@
> +  cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@
>
>  $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
> $(call if_changed,bin2c)
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 04bc07c..7a63d56 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -123,7 +123,7 @@ targets += config_data.gz
>  $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
> $(call if_changed,gzip)
>
> -  filechk_ikconfiggz = (echo "static const char kernel_config_data[] 
> __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
> +  filechk_ikconfiggz = (echo "static const char kernel_config_data[] 
> __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
>  targets += config_data.h
>  $(obj)/config_data.h: $(obj)/config_data.gz FORCE
> $(call filechk,ikconfiggz)
> diff --git a/scripts/.gitignore b/scripts/.gitignore
> index 0442c06..12d302d 100644
> --- a/scripts/.gitignore
> +++ b/scripts/.gitignore
> @@ -1,6 +1,7 @@
>  #
>  # Generated files
>  #
> +bin2c
>  conmakehash
>  kallsyms
>  pnmtologo
> diff --git a/scripts/Makefile b/scripts/Makefile
> index 25ab143..59c21ec 100644
> --- a/scripts/Makefile
> +++ b/scripts/Makefile
> @@ -10,6 +10,7 @@
>
>  HOST_EXTRACFLAGS += -I$(srctree)/tools/include
>
> +hostprogs-$(CONFIG_BUILD_BIN2C)  += bin2c
>  hostprogs-$(CONFIG_KALLSYMS) += kallsyms
>  hostprogs-$(CONFIG_LOGO) += pnmtologo
>  hostprogs-$(CONFIG_VT)   += conmakehash
> diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore
> index 9528ec9..a776371 100644
> --- a/scripts/basic/.gitignore
> +++ b/scripts/basic/.gitignore
> @@ -1,2 +1 @@
>  fixdep
> -bin2c
> diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile
> index 0372b33..af49b44 100644
> --- a/scripts/basic/Makefile
> +++ b/scripts/basic/Makefile
> @@ -9,7 +9,6 @@
>  # fixdep:   Used to generate dependency information during build process
>
>  hostprogs-y:= fixdep
> -hostprogs-$(CONFIG_BUILD_BIN2C) += bin2c
>  always := $(hostprogs-y)
>
>  # fixdep is needed to 

Re: [PATCH v2 2/2] powernv/cpuidle: Use parsed device tree values for cpuidle_init

2018-07-02 Thread Nicholas Piggin
On Mon,  2 Jul 2018 19:53:21 +0530
Akshay Adiga  wrote:

> Export pnv_idle_states and nr_pnv_idle_states so that they are accessible
> to the cpuidle driver. Use properties from the pnv_idle_states structure
> for powernv cpuidle_init.
>
> Signed-off-by: Akshay Adiga 

Reviewed-by: Nicholas Piggin 


Re: [PATCH 2/2] powerpc/ptrace-pkeys: execute-permission on keys are disabled by default

2018-07-02 Thread Thiago Jung Bauermann


Ram Pai  writes:

> The test case assumes execute-permissions of unallocated keys are
> enabled by default.
>
> Signed-off-by: Ram Pai 
> ---
>  .../testing/selftests/powerpc/ptrace/ptrace-pkey.c |5 +
>  1 files changed, 5 insertions(+), 0 deletions(-)
>
> diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c 
> b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> index 5cf631f..559c6cb 100644
> --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> @@ -104,6 +104,11 @@ static int child(struct shared_info *info)
>
>   if (disable_execute)
>   info->expected_iamr |= 1ul << pkeyshift(pkey1);
> + else
> + info->expected_iamr &= ~(1ul << pkeyshift(pkey1));
> + info->expected_iamr &= ~(1ul << pkeyshift(pkey2) | 1ul << 
> pkeyshift(pkey3));
> +
> +
>
>   info->expected_uamor |= 3ul << pkeyshift(pkey1) |
>   3ul << pkeyshift(pkey2);

Reviewed-by: Thiago Jung Bauermann 

-- 
Thiago Jung Bauermann
IBM Linux Technology Center



Re: [PATCH v2 1/2] powernv/cpuidle: Parse dt idle properties into global structure

2018-07-02 Thread Nicholas Piggin
On Mon,  2 Jul 2018 19:53:20 +0530
Akshay Adiga  wrote:

> Device-tree parsing happens twice, once while deciding idle state to be
> used for hotplug and once during cpuidle init. Hence, parsing the device
> tree and caching it will reduce code duplication. Parsing code has been
> moved to pnv_parse_cpuidle_dt() from pnv_probe_idle_states(). In addition
> to the properties in the device tree the number of available states is
> also required.
> 
> Signed-off-by: Akshay Adiga 
> ---
>  arch/powerpc/include/asm/cpuidle.h|  11 ++
>  arch/powerpc/platforms/powernv/idle.c | 216 --
>  drivers/cpuidle/cpuidle-powernv.c |  11 +-
>  3 files changed, 151 insertions(+), 87 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/cpuidle.h 
> b/arch/powerpc/include/asm/cpuidle.h
> index e210a83eb196..574b0ce1d671 100644
> --- a/arch/powerpc/include/asm/cpuidle.h
> +++ b/arch/powerpc/include/asm/cpuidle.h
> @@ -79,6 +79,17 @@ struct stop_sprs {
>   u64 mmcra;
>  };
>  
> +#define PNV_IDLE_NAME_LEN16
> +struct pnv_idle_states_t {
> + char name[PNV_IDLE_NAME_LEN];
> + u32 latency_ns;
> + u32 residency_ns;
> + u64 psscr_val;
> + u64 psscr_mask;
> + u32 flags;
> + bool valid;
> +};


This is a nice looking cleanup.

Reviewed-by: Nicholas Piggin 


Re: [PATCH 1/2] powerpc/core-pkeys: execute-permission on keys are disabled by default

2018-07-02 Thread Thiago Jung Bauermann


Ram Pai  writes:

> Only when the key is allocated, its permissions are enabled.
>
> Signed-off-by: Ram Pai 
> ---
>  tools/testing/selftests/powerpc/ptrace/core-pkey.c |4 
>  1 files changed, 4 insertions(+), 0 deletions(-)
>
> diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c 
> b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> index 36bc312..b353d86 100644
> --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> @@ -140,6 +140,10 @@ static int child(struct shared_info *info)
>
>   if (disable_execute)
>   info->iamr |= 1ul << pkeyshift(pkey1);
> + else
> + info->iamr &= ~(1ul << pkeyshift(pkey1));
> + info->iamr &= ~(1ul << pkeyshift(pkey2) | 1ul << pkeyshift(pkey3));
> +
>
>   info->uamor |= 3ul << pkeyshift(pkey1) | 3ul << pkeyshift(pkey2);

Reviewed-by: Thiago Jung Bauermann 

-- 
Thiago Jung Bauermann
IBM Linux Technology Center



Re: [PATCH v5 2/7] powerpc/pseries: Defer the logging of rtas error to irq work queue.

2018-07-02 Thread Nicholas Piggin
On Mon, 02 Jul 2018 11:16:29 +0530
Mahesh J Salgaonkar  wrote:

> From: Mahesh Salgaonkar 
> 
> rtas_log_buf is a buffer to hold RTAS event data that are communicated
> to kernel by hypervisor. This buffer is then used to pass RTAS event
> data to user through proc fs. This buffer is allocated from vmalloc
> (non-linear mapping) area.
> 
> On Machine check interrupt, register r3 points to RTAS extended event
> log passed by hypervisor that contains the MCE event. The pseries
> machine check handler then logs this error into rtas_log_buf. Because
> rtas_log_buf is a vmalloc-ed (non-linear) buffer, we end up taking a
> page fault (vector 0x300) while accessing it. Since the machine check
> interrupt handler runs in NMI context, we cannot afford to take any
> page fault. Page faults are not honored in NMI context and cause a
> kernel panic. Apart from that, as Nick pointed out, pSeries_log_error()
> also takes a spin_lock while logging the error, which is not safe in NMI
> context. It may end up in a deadlock if we get another MCE before releasing
> the lock. Fix this by deferring the logging of the rtas error to the irq
> work queue.
> 
> The current implementation uses two different buffers to hold the rtas
> error log depending on whether an extended log is provided or not. This
> makes it a bit difficult to identify which buffer has valid data that
> needs to be logged later in irq work. Simplify this by using a single
> buffer, one per paca, and copy the rtas log to it irrespective of whether
> an extended log is provided or not. Allocate this buffer below the RMA
> region so that it can be accessed in the real mode mce handler.
> 
> Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable 
> interrupt")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Mahesh Salgaonkar 

I think this looks reasonable. It doesn't fix that commit so much as
fixes the problem that's apparent after it's applied. I don't know if
we should backport this to a wider set of stable kernels? Aside from
that,

Reviewed-by: Nicholas Piggin 

Thanks,
Nick

> ---
>  arch/powerpc/include/asm/paca.h|3 ++
>  arch/powerpc/platforms/pseries/ras.c   |   47 
> ++--
>  arch/powerpc/platforms/pseries/setup.c |   16 +++
>  3 files changed, 51 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 3f109a3e3edb..b441fef53077 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -251,6 +251,9 @@ struct paca_struct {
>   void *rfi_flush_fallback_area;
>   u64 l1d_flush_size;
>  #endif
> +#ifdef CONFIG_PPC_PSERIES
> + u8 *mce_data_buf;   /* buffer to hold per cpu rtas errlog */
> +#endif /* CONFIG_PPC_PSERIES */
>  } cacheline_aligned;
>  
>  extern void copy_mm_to_paca(struct mm_struct *mm);
> diff --git a/arch/powerpc/platforms/pseries/ras.c 
> b/arch/powerpc/platforms/pseries/ras.c
> index ef104144d4bc..14a46b07ab2f 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -32,11 +33,13 @@
>  static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
>  static DEFINE_SPINLOCK(ras_log_buf_lock);
>  
> -static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
> -static DEFINE_PER_CPU(__u64, mce_data_buf);
> -
>  static int ras_check_exception_token;
>  
> +static void mce_process_errlog_event(struct irq_work *work);
> +static struct irq_work mce_errlog_process_work = {
> + .func = mce_process_errlog_event,
> +};
> +
>  #define EPOW_SENSOR_TOKEN9
>  #define EPOW_SENSOR_INDEX0
>  
> @@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void 
> *dev_id)
>   A) >= 0x7000) && ((A) < 0x7ff0)) || \
>   (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16
>  
> +static inline struct rtas_error_log *fwnmi_get_errlog(void)
> +{
> + return (struct rtas_error_log *)local_paca->mce_data_buf;
> +}
> +
>  /*
>   * Get the error information for errors coming through the
>   * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
>   * the actual r3 if possible, and a ptr to the error log entry
>   * will be returned if found.
>   *
> - * If the RTAS error is not of the extended type, then we put it in a per
> - * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
> + * Use one buffer mce_data_buf per cpu to store RTAS error.
>   *
> - * The global_mce_data_buf does not have any locks or protection around it,
> + * The mce_data_buf does not have any locks or protection around it,
>   * if a second machine check comes in, or a system reset is done
>   * before we have logged the error, then we will get corruption in the
>   * error log.  This is preferable over holding off on calling
> @@ -349,7 +356,7 @@ static irqreturn_t ras_error_interrupt(int irq, void 
> *dev_id)
>  static struct rtas_error_log 

Re: [PATCH] powerpc: mpc5200: Remove VLA usage

2018-07-02 Thread Michael Ellerman
Segher Boessenkool  writes:
> On Mon, Jul 02, 2018 at 11:33:32AM +1000, Michael Ellerman wrote:
>> What if we write it:
>> 
>>char saved_0x500[0x600 - 0x500];
>> 
>> Hopefully the compiler is smart enough not to generate a VLA for that :)
>
> It is a VLA if the array size is not an integer constant expression.  This
> is defined by C; the compiler has nothing to do with it.  0x600-0x500 is
> an integer constant expression, so this is not a VLA.

Thanks.

That wasn't meant as a dig at GCC. Kees had an epic struggle with the
kernel's min/max() macros which were causing expressions that looked
like they should be constant to generate VLAs.

> But if you meant if GCC will ever do a dynamic stack allocation for a fixed
> size local variable: yes indeed, I hope not!

Hey that would be cool, just-in-time local variable allocation :)

cheers


Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-02 Thread David Gibson
On Mon, Jul 02, 2018 at 04:32:27PM +1000, Alexey Kardashevskiy wrote:
> On Mon, 2 Jul 2018 14:52:43 +1000
> David Gibson  wrote:
> 
> > On Mon, Jul 02, 2018 at 02:33:30PM +1000, Alexey Kardashevskiy wrote:
> > > On Mon, 2 Jul 2018 14:08:52 +1000
> > > David Gibson  wrote:
> > >   
> > > > On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:  
> > > > > On Fri, 29 Jun 2018 15:18:20 +1000
> > > > > Alexey Kardashevskiy  wrote:
> > > > > 
> > > > > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > > > > David Gibson  wrote:
> > > > > > 
> > > > > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy 
> > > > > > > wrote:  
> > > > > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > > > > David Gibson  wrote:
> > > > > > > > 
> > > > > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey 
> > > > > > > > > Kardashevskiy wrote:
> > > > > > > > > > We already have a check in 
> > > > > > > > > > drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > > > > hardware won't
> > > > > > > > > > get access to unassigned host memory.
> > > > > > > > > > 
> > > > > > > > > > However we do not have this check in KVM fastpath 
> > > > > > > > > > (H_PUT_TCE accelerated
> > > > > > > > > > code) so the user space can pin memory backed with 64k 
> > > > > > > > > > pages and create
> > > > > > > > > > a hardware TCE table with a bigger page size. We were lucky 
> > > > > > > > > > so far and
> > > > > > > > > > did not hit this yet as the very first time the mapping 
> > > > > > > > > > happens
> > > > > > > > > > we do not have tbl::it_userspace allocated yet and fall 
> > > > > > > > > > back to
> > > > > > > > > > the userspace which in turn calls VFIO IOMMU driver and 
> > > > > > > > > > that fails
> > > > > > > > > > because of the check in vfio_iommu_spapr_tce.c which is 
> > > > > > > > > > really
> > > > > > > > > > sustainable solution.
> > > > > > > > > > 
> > > > > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > > > > preregistered
> > > > > > > > > > region descriptor and changes the mm_iommu_xxx API to check 
> > > > > > > > > > this against
> > > > > > > > > > the IOMMU page size.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > > > > ---
> > > > > > > > > > Changes:
> > > > > > > > > > v2:
> > > > > > > > > > * explicitly check for compound pages before calling 
> > > > > > > > > > compound_order()
> > > > > > > > > > 
> > > > > > > > > > ---
> > > > > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and 
> > > > > > > > > > tell it to
> > > > > > > > > > advertise 16MB pages to the guest; a typical pseries guest 
> > > > > > > > > > will use 16MB
> > > > > > > > > > for IOMMU pages without checking the mmu pagesize and this 
> > > > > > > > > > will fail
> > > > > > > > > > at 
> > > > > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > > > > 
> > > > > > > > > > With the change, mapping will fail in KVM and the guest 
> > > > > > > > > > will print:
> > > > > > > > > > 
> > > > > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 
> > > > > > > > > > 800 2000 18 1f returned 0 (liobn = 0x8001 
> > > > > > > > > > starting addr = 800 0)
> > > > > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 
> > > > > > > > > > for /pci@8002000/ethernet@0
> > > > > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > > > > /pci@8002000/ethernet@0: -1  
> > > > > > > > > 
> > > > > > > > > [snip]
> > > > > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > > > struct mm_iommu_table_group_mem_t **pmem)
> > > > > > > > > >  {
> > > > > > > > > > struct mm_iommu_table_group_mem_t *mem;
> > > > > > > > > > -   long i, j, ret = 0, locked_entries = 0;
> > > > > > > > > > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > > > > > struct page *page = NULL;
> > > > > > > > > >  
> > > > > > > > > > mutex_lock(_list_mutex);
> > > > > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > > > goto unlock_exit;
> > > > > > > > > > }
> > > > > > > > > >  
> > > > > > > >  > > +  mem->pageshift = 30; /* start from 1G pages - the 
> > > > > > > > biggest we have */  
> > > > > > > > > 
> > > > > > > > > What about 16G pages on an HPT system?
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Below in the loop mem->pageshift will reduce to the biggest 
> > > > > > > > actual size
> > > > > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory 

Re: [PATCH 2/2] powerpc/pkeys: key allocation/deallocation must not change pkey registers

2018-07-02 Thread Thiago Jung Bauermann


Ram Pai  writes:

> Key allocation and deallocation has the side effect of programming the
> UAMOR/AMR/IAMR registers. This is wrong, since it's the responsibility of
> the application and not that of the kernel, to modify the permission on
> the key.
>
> Do not modify the pkey registers at key allocation/deallocation.
>
> This patch also fixes a bug where a sys_pkey_free() resets the UAMOR
> bits of the key, thus making its permissions unmodifiable from user
> space.  Later, if the same key gets reallocated from a different thread,
> this thread will no longer be able to change the permissions on the key.
>
> Problem noticed/reported by Michael Ellerman while running
> selftests/core-pkeys
>
> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/include/asm/pkeys.h |   11 ---
>  arch/powerpc/mm/pkeys.c  |   27 ---
>  2 files changed, 0 insertions(+), 38 deletions(-)

LGTM.

Reviewed-by: Thiago Jung Bauermann 

-- 
Thiago Jung Bauermann
IBM Linux Technology Center



[PATCH] mm: allow arch to supply p??_free_tlb functions

2018-07-02 Thread Nicholas Piggin
The mmu_gather APIs keep track of the invalidated address range
including the span covered by invalidated page table pages. Ranges
covered by page tables but not ptes (and therefore no TLBs) still need
to be invalidated because some architectures (x86) can cache
intermediate page table entries, and invalidate those with normal TLB
invalidation instructions to be almost-backward-compatible.

Architectures which don't cache intermediate page table entries, or
which invalidate these caches separately from TLB invalidation, do not
require TLB invalidation range expanded over page tables.

Allow architectures to supply their own p??_free_tlb functions, which
can avoid the __tlb_adjust_range.

Signed-off-by: Nicholas Piggin 
---
Just wanted your ack/nack on this approach, I just tidied the patch
and re-did the changelog. We left off with you wondering if overriding
__tlb_adjust_range for page tables would be the better option, but I
couldn't see any real benefit over this way. Actually I think this is
cleaner, powerpc will simply switch the name of its function from
__pte_free_tlb to pte_free_tlb to take over the tlb management for it.

And is this something that you'd merge at this point of the cycle, so
that arch changes for next window won't include generic code changes or
have cross tree dependencies?

Thanks,
Nick

 include/asm-generic/tlb.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index faddde44de8c..3063125197ad 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -265,33 +265,41 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
  * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
  */
 
+#ifndef pte_free_tlb
 #define pte_free_tlb(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
__pte_free_tlb(tlb, ptep, address); \
} while (0)
+#endif
 
+#ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
__pmd_free_tlb(tlb, pmdp, address); \
} while (0)
+#endif
 
 #ifndef __ARCH_HAS_4LEVEL_HACK
+#ifndef pud_free_tlb
 #define pud_free_tlb(tlb, pudp, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
__pud_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
+#endif
 
 #ifndef __ARCH_HAS_5LEVEL_HACK
+#ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
__p4d_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
+#endif
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
-- 
2.17.0



Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.

2018-07-02 Thread Nicholas Piggin
On Mon, 02 Jul 2018 11:17:06 +0530
Mahesh J Salgaonkar  wrote:

> From: Mahesh Salgaonkar 
> 
> On pseries, as of today the system crashes if we get a machine check
> exception due to SLB errors. These are soft errors and can be fixed by
> flushing the SLBs so the kernel can continue to function instead of
> crashing. We do this in real mode before turning on MMU. Otherwise
> we would run into nested machine checks. This patch now fetches the
> rtas error log in real mode and flushes the SLBs on SLB errors.
> 
> Signed-off-by: Mahesh Salgaonkar 
> ---
>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 
>  arch/powerpc/include/asm/machdep.h|1 
>  arch/powerpc/kernel/exceptions-64s.S  |   42 +
>  arch/powerpc/kernel/mce.c |   16 +++-
>  arch/powerpc/mm/slb.c |6 +++
>  arch/powerpc/platforms/powernv/opal.c |1 
>  arch/powerpc/platforms/pseries/pseries.h  |1 
>  arch/powerpc/platforms/pseries/ras.c  |   51 
> +
>  arch/powerpc/platforms/pseries/setup.c|1 
>  9 files changed, 116 insertions(+), 4 deletions(-)
> 


> +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> +BEGIN_FTR_SECTION
> + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> + mr  r10,r1  /* Save r1 */
> + ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
> + subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/
> + mfspr   r11,SPRN_SRR0   /* Save SRR0 */
> + mfspr   r12,SPRN_SRR1   /* Save SRR1 */
> + EXCEPTION_PROLOG_COMMON_1()
> + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> + EXCEPTION_PROLOG_COMMON_3(0x200)
> + addir3,r1,STACK_FRAME_OVERHEAD
> + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */

Is there any reason you can't use the existing
machine_check_powernv_early code to do all this?

> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index efdd16a79075..221271c96a57 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
>  {
>   long handled = 0;
>  
> - __this_cpu_inc(irq_stat.mce_exceptions);
> + /*
> +  * For pSeries we count mce when we go into virtual mode machine
> +  * check handler. Hence skip it. Also, We can't access per cpu
> +  * variables in real mode for LPAR.
> +  */
> + if (early_cpu_has_feature(CPU_FTR_HVMODE))
> + __this_cpu_inc(irq_stat.mce_exceptions);
>  
> - if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> + /*
> +  * See if platform is capable of handling machine check.
> +  * Otherwise fallthrough and allow CPU to handle this machine check.
> +  */
> + if (ppc_md.machine_check_early)
> + handled = ppc_md.machine_check_early(regs);
> + else if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
>   handled = cur_cpu_spec->machine_check_early(regs);

Would be good to add a powernv ppc_md handler which does the
cur_cpu_spec->machine_check_early() call now that other platforms are
calling this code. Because those aren't valid as a fallback call, but
specific to powernv.

> diff --git a/arch/powerpc/platforms/powernv/opal.c 
> b/arch/powerpc/platforms/powernv/opal.c
> index 48fbb41af5d1..ed548d40a9e1 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -417,7 +417,6 @@ static int opal_recover_mce(struct pt_regs *regs,
>  
>   if (!(regs->msr & MSR_RI)) {
>   /* If MSR_RI isn't set, we cannot recover */
> - pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");

What's the reason for this change?

>   recovered = 0;
>   } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
>   /* Platform corrected itself */
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 60db2ee511fb..3611db5dd583 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -24,6 +24,7 @@ struct pt_regs;
>  
>  extern int pSeries_system_reset_exception(struct pt_regs *regs);
>  extern int pSeries_machine_check_exception(struct pt_regs *regs);
> +extern int pSeries_machine_check_realmode(struct pt_regs *regs);
>  
>  #ifdef CONFIG_SMP
>  extern void smp_init_pseries(void);
> diff --git a/arch/powerpc/platforms/pseries/ras.c 
> b/arch/powerpc/platforms/pseries/ras.c
> index 851ce326874a..9aa7885e0148 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -427,6 +427,35 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>   return 0; /* need to perform reset */
>  }
>  
> +static int mce_handle_error(struct rtas_error_log *errp)
> +{
> + struct pseries_errorlog *pseries_log;
> + 

Re: [Update] Regression in 4.18 - 32-bit PowerPC crashes on boot - bisected to commit 1d40a5ea01d5

2018-07-02 Thread Larry Finger

On 07/01/2018 11:16 PM, Michael Ellerman wrote:

Linus Torvalds  writes:

On Fri, Jun 29, 2018 at 1:42 PM Larry Finger  wrote:


I have more information regarding this BUG. Line 700 of page-flags.h is the
macro PAGE_TYPE_OPS(Table, table). For further debugging, I manually expanded
the macro, and found that the bug line is VM_BUG_ON_PAGE(!PageTable(page), page)
in routine __ClearPageTable(), which is called from pgtable_page_dtor() in
include/linux/mm.h. I also added a printk call to PageTable() that logs
page->page_type. The routine was called twice. The first had page_type of
0xfffffbff, which would have been expected for a page table. The second call had
0xffffffff, which led to the BUG.


So it looks to me like the tear-down of the page tables first found a
page that is indeed a page table, and cleared the page table bit
(well, it set it - the bits are reversed).

...


That said, can some ppc person who knows the 32-bit ppc code and maybe
knows what that "interrupt: 700" means talk about that oddity in the
trace, please?


I think everyone else answered your questions here, and it should be
fixed now in your tree.

Larry let me know if you're still seeing a crash with 4.18-rc3.


The problem is fixed in 4.18-rc3. Thanks to all that helped.

Larry



Re: [v3 PATCH 5/5] powerpc/pseries: Display machine check error details.

2018-07-02 Thread Michal Suchánek
On Fri, 8 Jun 2018 11:51:36 +1000
Nicholas Piggin  wrote:

> On Thu, 07 Jun 2018 22:59:04 +0530
> Mahesh J Salgaonkar  wrote:
> 
> > From: Mahesh Salgaonkar 
> > 
> > Extract the MCE error details from RTAS extended log and display it
> > to console.
> > 
> > With this patch you should now see mce logs like below:
> > 
> > [  142.371818] Severe Machine check interrupt [Recovered]
> > [  142.371822]   NIP [dca301b8]: init_module+0x1b8/0x338 [bork_kernel]
> > [  142.371822]   Initiator: CPU
> > [  142.371823]   Error type: SLB [Multihit]
> > [  142.371824] Effective address: dca7
> > 
> > Signed-off-by: Mahesh Salgaonkar 
> > ---
> >  arch/powerpc/include/asm/rtas.h  |5 +
> >  arch/powerpc/platforms/pseries/ras.c |  128
> > +- 2 files changed, 131
> > insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/rtas.h
> > b/arch/powerpc/include/asm/rtas.h index 3f2fba7ef23b..8100a95c133a
> > 100644 --- a/arch/powerpc/include/asm/rtas.h
> > +++ b/arch/powerpc/include/asm/rtas.h
> > @@ -190,6 +190,11 @@ static inline uint8_t
> > rtas_error_extended(const struct rtas_error_log *elog) return
> > (elog->byte1 & 0x04) >> 2; }
> >  
> > +static inline uint8_t rtas_error_initiator(const struct
> > rtas_error_log *elog) +{
> > +   return (elog->byte2 & 0xf0) >> 4;
> > +}
> > +
> >  #define rtas_error_type(x) ((x)->byte3)
> >  
> >  static inline
> > diff --git a/arch/powerpc/platforms/pseries/ras.c
> > b/arch/powerpc/platforms/pseries/ras.c index
> > e56759d92356..cd9446980092 100644 ---
> > a/arch/powerpc/platforms/pseries/ras.c +++
> > b/arch/powerpc/platforms/pseries/ras.c @@ -422,7 +422,130 @@ int
> > pSeries_system_reset_exception(struct pt_regs *regs) return 0; /*
> > need to perform reset */ }
> >  
> > -static int mce_handle_error(struct rtas_error_log *errp)
> > +#define VAL_TO_STRING(ar, val) ((val < ARRAY_SIZE(ar)) ?
> > ar[val] : "Unknown") +
> > +static void pseries_print_mce_info(struct pt_regs *regs,
> > +   struct rtas_error_log *errp, int
> > disposition) +{
> > +   const char *level, *sevstr;
> > +   struct pseries_errorlog *pseries_log;
> > +   struct pseries_mc_errorlog *mce_log;
> > +   uint8_t error_type, err_sub_type;
> > +   uint8_t initiator = rtas_error_initiator(errp);
> > +   uint64_t addr;
> > +
> > +   static const char * const initiators[] = {
> > +   "Unknown",
> > +   "CPU",
> > +   "PCI",
> > +   "ISA",
> > +   "Memory",
> > +   "Power Mgmt",
> > +   };
> > +   static const char * const mc_err_types[] = {
> > +   "UE",
> > +   "SLB",
> > +   "ERAT",
> > +   "TLB",
> > +   "D-Cache",
> > +   "Unknown",
> > +   "I-Cache",
> > +   };
> > +   static const char * const mc_ue_types[] = {
> > +   "Indeterminate",
> > +   "Instruction fetch",
> > +   "Page table walk ifetch",
> > +   "Load/Store",
> > +   "Page table walk Load/Store",
> > +   };
> > +
> > +   /* SLB sub errors valid values are 0x0, 0x1, 0x2 */
> > +   static const char * const mc_slb_types[] = {
> > +   "Parity",
> > +   "Multihit",
> > +   "Indeterminate",
> > +   };
> > +
> > +   /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3
> > */
> > +   static const char * const mc_soft_types[] = {
> > +   "Unknown",
> > +   "Parity",
> > +   "Multihit",
> > +   "Indeterminate",
> > +   };
> > +
> > +   pseries_log = get_pseries_errorlog(errp,
> > PSERIES_ELOG_SECT_ID_MCE);
> > +   if (pseries_log == NULL)
> > +   return;
> > +
> > +   mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
> > +
> > +   error_type = rtas_mc_error_type(mce_log);
> > +   err_sub_type = rtas_mc_error_sub_type(mce_log);
> > +
> > +   switch (rtas_error_severity(errp)) {
> > +   case RTAS_SEVERITY_NO_ERROR:
> > +   level = KERN_INFO;
> > +   sevstr = "Harmless";
> > +   break;
> > +   case RTAS_SEVERITY_WARNING:
> > +   level = KERN_WARNING;
> > +   sevstr = "";
> > +   break;
> > +   case RTAS_SEVERITY_ERROR:
> > +   case RTAS_SEVERITY_ERROR_SYNC:
> > +   level = KERN_ERR;
> > +   sevstr = "Severe";
> > +   break;
> > +   case RTAS_SEVERITY_FATAL:
> > +   default:
> > +   level = KERN_ERR;
> > +   sevstr = "Fatal";
> > +   break;
> > +   }
> > +
> > +   printk("%s%s Machine check interrupt [%s]\n", level,
> > sevstr,
> > +   disposition == RTAS_DISP_FULLY_RECOVERED ?
> > +   "Recovered" : "Not recovered");
> > +   if (user_mode(regs)) {
> > +   printk("%s  NIP: [%016lx] PID: %d Comm: %s\n",
> > level,
> > +   regs->nip, current->pid, current->comm);
> > +   } else {
> > +   printk("%s  NIP [%016lx]: %pS\n", level, regs->nip,
> > +   (void *)regs->nip);
> > +   }  
> 
> I 

Re: [PATCH 0/9] Fix references for some missing documentation files

2018-07-02 Thread Jonathan Corbet
On Tue, 26 Jun 2018 06:49:02 -0300
Mauro Carvalho Chehab  wrote:

> Having nothing to do while waiting for my plane to arrive while
> returning back from Japan, I ended up writing a small series of 
> patches meant to reduce the number of bad Documentation/* 
> links that are detected by:
>   ./scripts/documentation-file-ref-check

I've applied everything except the two networking patches, since I expect
those to go through Dave's tree.

Thanks,

jon


Re: [PATCH] powerpc: mpc5200: Remove VLA usage

2018-07-02 Thread Segher Boessenkool
On Mon, Jul 02, 2018 at 11:33:32AM +1000, Michael Ellerman wrote:
> What if we write it:
> 
>char saved_0x500[0x600 - 0x500];
> 
> Hopefully the compiler is smart enough not to generate a VLA for that :)

It is a VLA if the array size is not an integer constant expression.  This
is defined by C; the compiler has nothing to do with it.  0x600-0x500 is
an integer constant expression, so this is not a VLA.

But if you meant if GCC will ever do a dynamic stack allocation for a fixed
size local variable: yes indeed, I hope not!

(Sometimes GCC can avoid this even with VLAs; but in this example we do
not even have a VLA, so it's easier than that :-) )


Segher


[PATCH v2] powerpc: mpc5200: Remove VLA usage

2018-07-02 Thread Kees Cook
In the quest to remove all stack VLA usage from the kernel[1], this
switches to using a stack size large enough for the saved routine and
adds a sanity check making sure the routine doesn't overflow into the
0x600 exception handler.

[1] 
https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qpxydaacu1rq...@mail.gmail.com

Signed-off-by: Kees Cook 
Reviewed-by: Arnd Bergmann 
---
v2: use "0x600-0x500" for size calculation to illustrate handler sizes
---
 arch/powerpc/platforms/52xx/mpc52xx_pm.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pm.c 
b/arch/powerpc/platforms/52xx/mpc52xx_pm.c
index 31d3515672f3..b1d208ded981 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pm.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pm.c
@@ -117,7 +117,10 @@ int mpc52xx_pm_enter(suspend_state_t state)
u32 intr_main_mask;
void __iomem * irq_0x500 = (void __iomem *)CONFIG_KERNEL_START + 0x500;
unsigned long irq_0x500_stop = (unsigned long)irq_0x500 + 
mpc52xx_ds_cached_size;
-   char saved_0x500[mpc52xx_ds_cached_size];
+   char saved_0x500[0x600-0x500];
+
+   if (WARN_ON(mpc52xx_ds_cached_size > sizeof(saved_0x500)))
+   return -ENOMEM;
 
/* disable all interrupts in PIC */
intr_main_mask = in_be32(&intc->main_mask);
-- 
2.17.1


-- 
Kees Cook
Pixel Security


Re: [PATCH] powerpc: mpc5200: Remove VLA usage

2018-07-02 Thread Kees Cook
On Sun, Jul 1, 2018 at 6:33 PM, Michael Ellerman  wrote:
> Kees Cook  writes:
>
>> On Fri, Jun 29, 2018 at 2:02 PM, Arnd Bergmann  wrote:
>>> On Fri, Jun 29, 2018 at 8:53 PM, Kees Cook  wrote:
>>>> In the quest to remove all stack VLA usage from the kernel[1], this
>>>> switches to using a stack size large enough for the saved routine and
>>>> adds a sanity check.
>>>>
>>>> [1] 
>>>> https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qpxydaacu1rq...@mail.gmail.com
>>>>
>>>> Signed-off-by: Kees Cook 
>>>
>>> This seems particularly nice: not only does it avoid the dynamic stack
>>> allocation, it
>>> also makes sure the new 0x500 handler doesn't overflow into the 0x600
>>> exception handler.
>>>
>>> It would help to explain how you arrived at that '256 byte' number in
>>> the changelog though.
>>
>> Honestly, I just counted instructions, multiplied by 8 and rounded up
>> to the next nearest power of 2, and the result felt right for giving
>> some level of flexibility for code growth before tripping the WARN. :P
>>
>> I'm happy to adjust, of course. :)
>
> What if we write it:
>
>char saved_0x500[0x600 - 0x500];
>
> Hopefully the compiler is smart enough not to generate a VLA for that :)

Sure, that's fine. I'll send an updated patch.

-Kees

-- 
Kees Cook
Pixel Security


[PATCH v2 2/2] powernv/cpuidle: Use parsed device tree values for cpuidle_init

2018-07-02 Thread Akshay Adiga
Export pnv_idle_states and nr_pnv_idle_states so that they are accessible to
the cpuidle driver. Use properties from the pnv_idle_states structure for powernv
cpuidle_init.

Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h |   2 +
 drivers/cpuidle/cpuidle-powernv.c  | 143 +
 2 files changed, 26 insertions(+), 119 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 574b0ce1d671..43e5f31fe64d 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -90,6 +90,8 @@ struct pnv_idle_states_t {
bool valid;
 };
 
+extern struct pnv_idle_states_t *pnv_idle_states;
+extern int nr_pnv_idle_states;
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
 
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 7ab613d4dca1..ec93b2ae7b17 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -242,6 +242,7 @@ static inline void add_powernv_state(int index, const char 
*name,
powernv_states[index].target_residency = target_residency;
powernv_states[index].exit_latency = exit_latency;
powernv_states[index].enter = idle_fn;
+   /* For power8 and below psscr_* will be 0 */
stop_psscr_table[index].val = psscr_val;
stop_psscr_table[index].mask = psscr_mask;
 }
@@ -263,179 +264,84 @@ static inline int validate_dt_prop_sizes(const char 
*prop1, int prop1_len,
 extern u32 pnv_get_supported_cpuidle_states(void);
 static int powernv_add_idle_states(void)
 {
-   struct device_node *power_mgt;
int nr_idle_states = 1; /* Snooze */
-   int dt_idle_states, count;
-   u32 latency_ns[CPUIDLE_STATE_MAX];
-   u32 residency_ns[CPUIDLE_STATE_MAX];
-   u32 flags[CPUIDLE_STATE_MAX];
-   u64 psscr_val[CPUIDLE_STATE_MAX];
-   u64 psscr_mask[CPUIDLE_STATE_MAX];
-   const char *names[CPUIDLE_STATE_MAX];
+   int dt_idle_states;
u32 has_stop_states = 0;
-   int i, rc;
+   int i;
u32 supported_flags = pnv_get_supported_cpuidle_states();
 
 
/* Currently we have snooze statically defined */
-
-   power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
-   if (!power_mgt) {
-   pr_warn("opal: PowerMgmt Node not found\n");
+   if (nr_pnv_idle_states <= 0) {
+   pr_warn("cpuidle-powernv : Only Snooze is available\n");
goto out;
}
 
-   /* Read values of any property to determine the num of idle states */
-   dt_idle_states = of_property_count_u32_elems(power_mgt, 
"ibm,cpu-idle-state-flags");
-   if (dt_idle_states < 0) {
-   pr_warn("cpuidle-powernv: no idle states found in the DT\n");
-   goto out;
-   }
-
-   count = of_property_count_u32_elems(power_mgt,
-   "ibm,cpu-idle-state-latencies-ns");
-
-   if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states,
-  "ibm,cpu-idle-state-latencies-ns",
-  count) != 0)
-   goto out;
-
-   count = of_property_count_strings(power_mgt,
- "ibm,cpu-idle-state-names");
-   if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states,
-  "ibm,cpu-idle-state-names",
-  count) != 0)
-   goto out;
+   /* TODO: Count only states which are eligible for cpuidle */
+   dt_idle_states = nr_pnv_idle_states;
 
/*
 * Since snooze is used as first idle state, max idle states allowed is
 * CPUIDLE_STATE_MAX -1
 */
-   if (dt_idle_states > CPUIDLE_STATE_MAX - 1) {
+   if (nr_pnv_idle_states > CPUIDLE_STATE_MAX - 1) {
pr_warn("cpuidle-powernv: discovered idle states more than 
allowed");
dt_idle_states = CPUIDLE_STATE_MAX - 1;
}
 
-   if (of_property_read_u32_array(power_mgt,
-   "ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
-   pr_warn("cpuidle-powernv : missing ibm,cpu-idle-state-flags in 
DT\n");
-   goto out;
-   }
-
-   if (of_property_read_u32_array(power_mgt,
-   "ibm,cpu-idle-state-latencies-ns", latency_ns,
-   dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing 
ibm,cpu-idle-state-latencies-ns in DT\n");
-   goto out;
-   }
-   if (of_property_read_string_array(power_mgt,
-   "ibm,cpu-idle-state-names", names, dt_idle_states) < 0) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in 
DT\n");
-   goto out;
-   }
-
/*
 * If the idle states use stop instruction, probe for psscr values
 * and psscr mask which are 

[PATCH v2 1/2] powernv/cpuidle: Parse dt idle properties into global structure

2018-07-02 Thread Akshay Adiga
Device-tree parsing happens twice, once while deciding idle state to be
used for hotplug and once during cpuidle init. Hence, parsing the device
tree and caching it will reduce code duplication. Parsing code has been
moved to pnv_parse_cpuidle_dt() from pnv_probe_idle_states(). In addition
to the properties in the device tree the number of available states is
also required.

Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h|  11 ++
 arch/powerpc/platforms/powernv/idle.c | 216 --
 drivers/cpuidle/cpuidle-powernv.c |  11 +-
 3 files changed, 151 insertions(+), 87 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index e210a83eb196..574b0ce1d671 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -79,6 +79,17 @@ struct stop_sprs {
u64 mmcra;
 };
 
+#define PNV_IDLE_NAME_LEN16
+struct pnv_idle_states_t {
+   char name[PNV_IDLE_NAME_LEN];
+   u32 latency_ns;
+   u32 residency_ns;
+   u64 psscr_val;
+   u64 psscr_mask;
+   u32 flags;
+   bool valid;
+};
+
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
 
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 1c5d0675b43c..7cf71b3e03a1 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -36,6 +36,8 @@
 #define P9_STOP_SPR_PSSCR  855
 
 static u32 supported_cpuidle_states;
+struct pnv_idle_states_t *pnv_idle_states;
+int nr_pnv_idle_states;
 
 /*
  * The default stop state that will be used by ppc_md.power_save
@@ -622,48 +624,10 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 
*psscr_mask, u32 flags)
  * @dt_idle_states: Number of idle state entries
  * Returns 0 on success
  */
-static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
-   int dt_idle_states)
+static int __init pnv_power9_idle_init(void)
 {
-   u64 *psscr_val = NULL;
-   u64 *psscr_mask = NULL;
-   u32 *residency_ns = NULL;
u64 max_residency_ns = 0;
-   int rc = 0, i;
-
-   psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
-   psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
-   residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns),
-  GFP_KERNEL);
-
-   if (!psscr_val || !psscr_mask || !residency_ns) {
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u64_array(np,
-   "ibm,cpu-idle-state-psscr",
-   psscr_val, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in 
DT\n");
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u64_array(np,
-  "ibm,cpu-idle-state-psscr-mask",
-  psscr_mask, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask 
in DT\n");
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u32_array(np,
-  "ibm,cpu-idle-state-residency-ns",
-   residency_ns, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing 
ibm,cpu-idle-state-residency-ns in DT\n");
-   rc = -1;
-   goto out;
-   }
+   int i;
 
/*
 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
@@ -679,33 +643,36 @@ static int __init pnv_power9_idle_init(struct device_node 
*np, u32 *flags,
 * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
 */
pnv_first_deep_stop_state = MAX_STOP_STATE;
-   for (i = 0; i < dt_idle_states; i++) {
+   for (i = 0; i < nr_pnv_idle_states; i++) {
int err;
-   u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK;
+   struct pnv_idle_states_t *state = &pnv_idle_states[i];
+   u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
 
-   if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) &&
-(pnv_first_deep_stop_state > psscr_rl))
+   if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+   pnv_first_deep_stop_state > psscr_rl)
pnv_first_deep_stop_state = psscr_rl;
 
-   err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i],
- flags[i]);
+   err = validate_psscr_val_mask(&state->psscr_val,
+ &state->psscr_mask,
+ state->flags);
if (err) {
-   report_invalid_psscr_val(psscr_val[i], err);
+   state->valid = false;
+   

[PATCH v2 0/2] powernv/cpuidle Device-tree parsing cleanup

2018-07-02 Thread Akshay Adiga
The device tree is parsed multiple times in the powernv cpuidle and powernv
hotplug code.

The first time is to identify the supported flags. The second time is to identify
the deepest state and the first deep state. The third time is during cpuidle init,
to find the available idle states. Any change in the device-tree format requires
changes in all 3 of these places. Errors in the device tree can also be handled in a better manner.

This series adds code to parse device tree once and save in global structure.

Changes from v1 :  
 - folded first 2 patches into 1 
 - rename pm_ctrl_reg_* as psscr_* 
 - added comment stating removal of pmicr parsing code 
 - removed parsing code for pmicr  
 - add member valid in pnv_idle_states_t to indicate if the psscr-mask/val 
are valid combination, 
 - Change function description of pnv_parse_cpuidle_dt 
 - Added error handling code.  

Akshay Adiga (2):
  powernv/cpuidle: Parse dt idle properties into global structure
  powernv/cpuidle: Use parsed device tree values for cpuidle_init

 arch/powerpc/include/asm/cpuidle.h|  13 ++
 arch/powerpc/platforms/powernv/idle.c | 216 --
 drivers/cpuidle/cpuidle-powernv.c | 156 ---
 3 files changed, 178 insertions(+), 207 deletions(-)

-- 
2.18.0.rc2.85.g1fb9df7



[PATCH] powerpc: hwrng; fix missing of_node_put()

2018-07-02 Thread Nicholas Mc Guire
The call to of_find_compatible_node() returns a node pointer with its refcount
incremented, so it must be explicitly decremented here before returning.

Signed-off-by: Nicholas Mc Guire 
Fixes: commit a489043f4626 ("powerpc/pseries: Implement arch_get_random_long() 
based on H_RANDOM")
---
Problem found with experimental coccinelle script

Patch was compiletested with: ppc64_defconfig (implies CONFIG_PPC_PSERIES=y)
with some unrelated sparse warnings (which I did not understand)
./arch/powerpc/include/asm/head-64.h:13:36: warning: Unknown escape '('
./arch/powerpc/include/asm/head-64.h:16:36: warning: Unknown escape '('
./arch/powerpc/include/asm/head-64.h:19:36: warning: Unknown escape '('

Patch is against 4.18-rc2 (localversion-next is next-20180702)

 arch/powerpc/platforms/pseries/rng.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/pseries/rng.c 
b/arch/powerpc/platforms/pseries/rng.c
index 31ca557..262b8c5 100644
--- a/arch/powerpc/platforms/pseries/rng.c
+++ b/arch/powerpc/platforms/pseries/rng.c
@@ -40,6 +40,7 @@ static __init int rng_init(void)
 
ppc_md.get_random_seed = pseries_get_random_long;
 
+   of_node_put(dn);
return 0;
 }
 machine_subsys_initcall(pseries, rng_init);
-- 
2.1.4



Re: [PATCH v4 01/11] macintosh/via-pmu: Fix section mismatch warning

2018-07-02 Thread Finn Thain
On Mon, 2 Jul 2018, Mathieu Malaterre wrote:

> On Mon, Jul 2, 2018 at 10:25 AM Finn Thain  
> wrote:
> >
> > The pmu_init() function has the __init qualifier, but the ops struct 
> > that holds a pointer to it does not. This causes a build warning. The 
> > driver works fine because the pointer is only dereferenced early.
> >
> > The function is so small that there's negligible benefit from using 
> > the __init qualifier. Remove it to fix the warning, consistent with 
> > the other ADB drivers.
> 
> Would you mind copy/pasting the warning you are seeing.
> 
> Make sure you have:
> 
> 58935176ad17 powerpc/via-pmu: Fix section mismatch warning
> 
> Thanks
> 

It's true, the section mismatch warning from 'make' has disappeared since 
I wrote this patch, but that doesn't mean it is wrong.

Before this patch:

$ powerpc-linux-gnu-objdump -xda vmlinux |egrep -w "via_pmu_driver|pmu_init"
c0711c84 l F .init.text 001c pmu_init
c05eb408 g O .rodata0028 via_pmu_driver
c0711c84 <pmu_init>:
$ 

After:

$ powerpc-linux-gnu-objdump -xda vmlinux |egrep -w "via_pmu_driver|pmu_init"
c038e42c l F .text  001c pmu_init
c05e1e58 g O .rodata0028 via_pmu_driver
c038e42c <pmu_init>:
$

I gather that commit 58935176ad17 ("powerpc/via-pmu: Fix section mismatch 
warning") has moved via_pmu_driver from .data to .rodata, but I'm afraid I 
don't see the point of that change. The commit log entry doesn't explain 
it either.

If .rodata is not discarded then the dangling pointer remains, right?

-- 


Re: [PATCH v4 01/11] macintosh/via-pmu: Fix section mismatch warning

2018-07-02 Thread Mathieu Malaterre
On Mon, Jul 2, 2018 at 10:25 AM Finn Thain  wrote:
>
> The pmu_init() function has the __init qualifier, but the ops struct
> that holds a pointer to it does not. This causes a build warning.
> The driver works fine because the pointer is only dereferenced early.
>
> The function is so small that there's negligible benefit from using
> the __init qualifier. Remove it to fix the warning, consistent with
> the other ADB drivers.

Would you mind copy/pasting the warning you are seeing.

Make sure you have:

58935176ad17 powerpc/via-pmu: Fix section mismatch warning

Thanks

> Tested-by: Stan Johnson 
> Signed-off-by: Finn Thain 
> Reviewed-by: Geert Uytterhoeven 
> ---
>  drivers/macintosh/via-pmu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
> index 25c1ce811053..f8a2c917201f 100644
> --- a/drivers/macintosh/via-pmu.c
> +++ b/drivers/macintosh/via-pmu.c
> @@ -378,7 +378,7 @@ static int pmu_probe(void)
> return vias == NULL? -ENODEV: 0;
>  }
>
> -static int __init pmu_init(void)
> +static int pmu_init(void)
>  {
> if (vias == NULL)
> return -ENODEV;
> --
> 2.16.4
>


Re: [PATCH v11 00/26] Speculative page faults

2018-07-02 Thread Laurent Dufour
On 11/06/2018 09:49, Song, HaiyanX wrote:
> Hi Laurent,
> 
> Regression tests for the v11 patch series have been run; some regressions were
> found by LKP-tools (linux kernel performance),
> tested on an Intel 4s Skylake platform. This time we only tested the cases which
> had been run and found to regress on the
> V9 patch series.
> 
> The regression result is sorted by the metric will-it-scale.per_thread_ops.
> branch: Laurent-Dufour/Speculative-page-faults/20180520-045126
> commit id:
>   head commit : a7a8993bfe3ccb54ad468b9f1799649e4ad1ff12
>   base commit : ba98a1cdad71d259a194461b3a61471b49b14df1
> Benchmark: will-it-scale
> Download link: https://github.com/antonblanchard/will-it-scale/tree/master
> 
> Metrics:
>   will-it-scale.per_process_ops=processes/nr_cpu
>   will-it-scale.per_thread_ops=threads/nr_cpu
>   test box: lkp-skl-4sp1(nr_cpu=192,memory=768G)
> THP: enable / disable
> nr_task:100%
> 
> 1. Regressions:
> 
> a). Enable THP
> testcase  base   change  head   
> metric
> page_fault3/enable THP   10519  -20.5%836  
> will-it-scale.per_thread_ops
> page_fault2/enable THP8281  -18.8%   6728  
> will-it-scale.per_thread_ops
> brk1/enable THP 998475   -2.2% 976893  
> will-it-scale.per_process_ops
> context_switch1/enable THP  223910   -1.3% 220930  
> will-it-scale.per_process_ops
> context_switch1/enable THP  233722   -1.0% 231288  
> will-it-scale.per_thread_ops
> 
> b). Disable THP
> page_fault3/disable THP  10856  -23.1%   8344  
> will-it-scale.per_thread_ops
> page_fault2/disable THP   8147  -18.8%   6613  
> will-it-scale.per_thread_ops
> brk1/disable THP   957   -7.9%881  
> will-it-scale.per_thread_ops
> context_switch1/disable THP 237006   -2.2% 231907  
> will-it-scale.per_thread_ops
> brk1/disable THP997317   -2.0% 98  
> will-it-scale.per_process_ops
> page_fault3/disable THP 467454   -1.8% 459251  
> will-it-scale.per_process_ops
> context_switch1/disable THP 224431   -1.3% 221567  
> will-it-scale.per_process_ops
> 
> Notes: for the above  values of test result, the higher is better.

I tried the same tests on my PowerPC victim VM (1024 CPUs, 11TB) and I can't
get reproducible results. The results have huge variation, even on the vanilla
kernel, and I can't state on any changes due to that.

I tried on smaller node (80 CPUs, 32G), and the tests ran better, but I didn't
measure any changes between the vanilla and the SPF patched ones:

test THP enabled4.17.0-rc4-mm1  spf delta
page_fault3_threads 2697.7  2683.5  -0.53%
page_fault2_threads 170660.6169574.1-0.64%
context_switch1_threads 6915269.2   6877507.3   -0.55%
context_switch1_processes   6478076.2   6529493.5   0.79%
brk1243391.2238527.5-2.00%

Tests were run 10 times, no high variation detected.

Did you see high variation on your side? How many times were the tests run to
compute the average values?

Thanks,
Laurent.


> 
> 2. Improvement: not found improvement based on the selected test cases.
> 
> 
> Best regards
> Haiyan Song
> 
> From: owner-linux...@kvack.org [owner-linux...@kvack.org] on behalf of 
> Laurent Dufour [lduf...@linux.vnet.ibm.com]
> Sent: Monday, May 28, 2018 4:54 PM
> To: Song, HaiyanX
> Cc: a...@linux-foundation.org; mho...@kernel.org; pet...@infradead.org; 
> kir...@shutemov.name; a...@linux.intel.com; d...@stgolabs.net; j...@suse.cz; 
> Matthew Wilcox; khand...@linux.vnet.ibm.com; aneesh.ku...@linux.vnet.ibm.com; 
> b...@kernel.crashing.org; m...@ellerman.id.au; pau...@samba.org; Thomas 
> Gleixner; Ingo Molnar; h...@zytor.com; Will Deacon; Sergey Senozhatsky; 
> sergey.senozhatsky.w...@gmail.com; Andrea Arcangeli; Alexei Starovoitov; 
> Wang, Kemi; Daniel Jordan; David Rientjes; Jerome Glisse; Ganesh Mahendran; 
> Minchan Kim; Punit Agrawal; vinayak menon; Yang Shi; 
> linux-ker...@vger.kernel.org; linux...@kvack.org; ha...@linux.vnet.ibm.com; 
> npig...@gmail.com; bsinghar...@gmail.com; paul...@linux.vnet.ibm.com; Tim 
> Chen; linuxppc-dev@lists.ozlabs.org; x...@kernel.org
> Subject: Re: [PATCH v11 00/26] Speculative page faults
> 
> On 28/05/2018 10:22, Haiyan Song wrote:
>> Hi Laurent,
>>
>> Yes, these tests are done on V9 patch.
> 
> Do you plan to give this V11 a run ?
> 
>>
>>
>> Best regards,
>> Haiyan Song
>>
>> On Mon, May 28, 2018 at 09:51:34AM +0200, Laurent Dufour wrote:
>>> On 28/05/2018 07:23, Song, HaiyanX wrote:

 Some regression and improvements is found by LKP-tools(linux kernel 
 performance) on V9 patch series
 tested on Intel 4s 

Re: [PATCH kernel 1/2] powerpc/powernv: Reuse existing TCE code for sketchy bypass

2018-07-02 Thread Alexey Kardashevskiy
On Sat, 16 Jun 2018 11:04:32 +1000
Benjamin Herrenschmidt  wrote:

> On Fri, 2018-06-01 at 18:10 +1000, Alexey Kardashevskiy wrote:
> > The existing sketchy bypass ignores the existing default 32bit TCE table
> > (created by default for every PE at boot time or after being used by
> > VFIO) and it allocates another table instead without updating PE DMA
> > config (pe->table_group). So if we decide to use such device for VFIO
> > later, this new table will also leak memory.
> > 
> > This replaces adhoc table allocation and programming with the existing
> > API which handles memory leaks.
> > 
> > This programs the default 32bit table back to TVE#0 if configuring
> > the new table failed for some reason.
> > 
> > While we are at it, switch from the hardcoded 256MB TCEs to the biggest
> > size supported by the hardware and reported by the firmware. This allows
> > the sketchy bypass (originally made for POWER8 only) to work on POWER9
> > too assuming that PHB4 type is defined and pnv_pci_ioda_dma_64bit_bypass()
> > is called (coming next).  
> 
> It won't work on POWER9 for other reasons. Mostly memory isn't
> contiguous there.


This simply allocates a table using the existing API which makes
things more accurate and manageable. "[PATCH 0/3] PCI DMA pseudo-bypass
for powernv" still allocates a table and programs it via OPAL, why does
it have to replicate all the helpers I put there? Do they all suck and
the new ones are better? If so, how exactly are they better? And let's
fix them then.



> 
> What we should look into rather than removing the 32-bit table is
> exploit DD2 P9 feature allowing to overlay TVE0 and TVE1 so that 0..4G
> is in control of TVE0 and the rest goes to TVE1.
>  
> > This does not call iommu_init_table() for the new table as the caller
> > will use _nommu_ops and therefore ::it_map is not needed.
> > 
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> > 
> > Tested with:
> > if (pe->tce_bypass_enabled) {
> > top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
> > -   bypass = (dma_mask >= top);
> > +   bypass = false;//(dma_mask >= top);
> > }
> > ---
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 71 
> > +--
> >  1 file changed, 39 insertions(+), 32 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> > b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index ceb7e64..9239142 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -1791,54 +1791,61 @@ static bool pnv_pci_ioda_pe_single_vendor(struct 
> > pnv_ioda_pe *pe)
> >   *
> >   * Currently this will only work on PHB3 (POWER8).
> >   */
> > +static long pnv_pci_ioda2_create_table(struct iommu_table_group 
> > *table_group,
> > +   int num, __u32 page_shift, __u64 window_size, __u32 levels,
> > +   struct iommu_table **ptbl);
> > +
> > +static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
> > +   int num, struct iommu_table *tbl);
> > +
> > +static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
> > +
> >  static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
> >  {
> > -   u64 window_size, table_size, tce_count, addr;
> > -   struct page *table_pages;
> > -   u64 tce_order = 28; /* 256MB TCEs */
> > -   __be64 *tces;
> > +   u64 window_size;
> > s64 rc;
> > +   struct iommu_table *tbl, *oldtbl = NULL;
> > +   unsigned long shift, offset;
> >  
> > /*
> >  * Window size needs to be a power of two, but needs to account for
> >  * shifting memory by the 4GB offset required to skip 32bit space.
> >  */
> > -   window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
> > -   tce_count = window_size >> tce_order;
> > -   table_size = tce_count << 3;
> > -
> > -   if (table_size < PAGE_SIZE)
> > -   table_size = PAGE_SIZE;
> > +   window_size = roundup_pow_of_two(memory_hotplug_max() + SZ_4G);
> > +   shift = ilog2(pnv_ioda_parse_tce_sizes(pe->phb));
> > +   rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, shift, window_size,
> > +   POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
> > +   if (rc) {
> > +   pe_err(pe, "Failed to create 64-bypass TCE table, err %ld", rc);
> > +   return rc;
> > +   }
> >  
> > -   table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
> > -  get_order(table_size));
> > -   if (!table_pages)
> > +   offset = SZ_4G >> shift;
> > +   rc = tbl->it_ops->set(tbl, offset, tbl->it_size - offset,
> > +   0 /* uaddr */, DMA_BIDIRECTIONAL, 0 /* attrs */);
> > +   if (rc)
> > goto err;
> >  
> > -   tces = page_address(table_pages);
> > -   if (!tces)
> > +   if (pe->table_group.tables[0]) {
> > +   oldtbl = pe->table_group.tables[0];
> > +   pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> > +   }
> > +
> > +   rc = 

Re: [PATCH 2/3] powerpc/powernv: DMA operations for discontiguous allocation

2018-07-02 Thread Alexey Kardashevskiy
On Fri, 29 Jun 2018 17:34:36 +1000
Russell Currey  wrote:

> DMA pseudo-bypass is a new set of DMA operations that solve some issues for
> devices that want to address more than 32 bits but can't address the 59
> bits required to enable direct DMA.
> 
> The previous implementation for POWER8/PHB3 worked around this by
> configuring a bypass from the default 32-bit address space into 64-bit
> address space.  This approach does not work for POWER9/PHB4 because
> regions of memory are discontiguous and many devices will be unable to
> address memory beyond the first node.
> 
> Instead, implement a new set of DMA operations that allocate TCEs as DMA
> mappings are requested so that all memory is addressable even when a
> one-to-one mapping between real addresses and DMA addresses isn't
> possible.  These TCEs are the maximum size available on the platform,
> which is 256M on PHB3 and 1G on PHB4.
> 
> Devices can now map any region of memory up to the maximum amount they can
> address according to the DMA mask set, in chunks of the largest available
> TCE size.
> 
> This implementation replaces the need for the existing PHB3 solution and
> should be compatible with future PHB versions.
> 
> It is, however, rather naive.  There is no unmapping, and as a result
> devices can eventually run out of space if they address their entire
> DMA mask worth of TCEs.  An implementation with unmap() will come in
> future (and requires a much more complex implementation), but this is a
> good start due to the drastic performance improvement.


Why does dma_iommu_ops not work in this case? I keep asking and yet there is
no comment in the commit log or mails...


> 
> Signed-off-by: Russell Currey 
> ---
>  arch/powerpc/include/asm/dma-mapping.h|   1 +
>  arch/powerpc/platforms/powernv/Makefile   |   2 +-
>  arch/powerpc/platforms/powernv/pci-dma.c  | 243 ++
>  arch/powerpc/platforms/powernv/pci-ioda.c |  82 +++-
>  arch/powerpc/platforms/powernv/pci.h  |   7 +
>  5 files changed, 281 insertions(+), 54 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c

Too generic file name for such a hack.


> 
> diff --git a/arch/powerpc/include/asm/dma-mapping.h 
> b/arch/powerpc/include/asm/dma-mapping.h
> index 8fa394520af6..354f435160f3 100644
> --- a/arch/powerpc/include/asm/dma-mapping.h
> +++ b/arch/powerpc/include/asm/dma-mapping.h
> @@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device 
> *dev)
>  extern struct dma_map_ops dma_iommu_ops;
>  #endif
>  extern const struct dma_map_ops dma_nommu_ops;
> +extern const struct dma_map_ops dma_pseudo_bypass_ops;
>  
>  static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type 
> *bus)
>  {
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index 703a350a7f4e..2467bdab3c13 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o 
> opal-power.o opal-irqchip.o
>  obj-y+= opal-kmsg.o opal-powercap.o opal-psr.o 
> opal-sensor-groups.o
>  
>  obj-$(CONFIG_SMP)+= smp.o subcore.o subcore-asm.o
> -obj-$(CONFIG_PCI)+= pci.o pci-ioda.o npu-dma.o
> +obj-$(CONFIG_PCI)+= pci.o pci-ioda.o npu-dma.o pci-dma.o
>  obj-$(CONFIG_CXL_BASE)   += pci-cxl.o
>  obj-$(CONFIG_EEH)+= eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM)   += opal-xscom.o
> diff --git a/arch/powerpc/platforms/powernv/pci-dma.c 
> b/arch/powerpc/platforms/powernv/pci-dma.c
> new file mode 100644
> index ..79382627c7be
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/pci-dma.c
> @@ -0,0 +1,243 @@
> +/*
> + * DMA operations supporting pseudo-bypass for PHB3+
> + *
> + * Author: Russell Currey 
> + *
> + * Copyright 2018 IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the
> + * Free Software Foundation; either version 2 of the License, or (at your
> + * option) any later version.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "pci.h"
> +
> +/*
> + * This is a naive implementation that directly operates on TCEs, allocating
> + * on demand.  There is no locking or refcounts since no TCEs are ever 
> removed
> + * and unmap does nothing.
> + */
> +static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev,
> + phys_addr_t addr)
> +{
> + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
> + struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> + struct pnv_phb *phb = hose->private_data;
> + struct pnv_ioda_pe *pe;
> + u64 i, tce, ret, offset;
> + __be64 entry;
> +
> + offset = addr & ((1 << phb->ioda.max_tce_order) - 1);
> +
> + 

[PATCH v4 09/11] macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver

2018-07-02 Thread Finn Thain
Now that the PowerMac via-pmu driver supports m68k PowerBooks,
switch over to that driver and remove the via-pmu68k driver.

Cc: Geert Uytterhoeven 
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/m68k/configs/mac_defconfig   |   2 +-
 arch/m68k/configs/multi_defconfig |   2 +-
 arch/m68k/mac/config.c|   2 +-
 arch/m68k/mac/misc.c  |  48 +--
 drivers/macintosh/Kconfig |  13 +-
 drivers/macintosh/Makefile|   1 -
 drivers/macintosh/adb.c   |   2 +-
 drivers/macintosh/via-pmu68k.c| 846 --
 include/uapi/linux/pmu.h  |   2 +-
 9 files changed, 14 insertions(+), 904 deletions(-)
 delete mode 100644 drivers/macintosh/via-pmu68k.c

diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index b52e597899eb..087ca15e32f1 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -369,7 +369,7 @@ CONFIG_TCM_PSCSI=m
 CONFIG_ADB=y
 CONFIG_ADB_MACII=y
 CONFIG_ADB_IOP=y
-CONFIG_ADB_PMU68K=y
+CONFIG_ADB_PMU=y
 CONFIG_ADB_CUDA=y
 CONFIG_INPUT_ADBHID=y
 CONFIG_MAC_EMUMOUSEBTN=y
diff --git a/arch/m68k/configs/multi_defconfig 
b/arch/m68k/configs/multi_defconfig
index 2a84eeec5b02..3f9334084d55 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -402,7 +402,7 @@ CONFIG_TCM_PSCSI=m
 CONFIG_ADB=y
 CONFIG_ADB_MACII=y
 CONFIG_ADB_IOP=y
-CONFIG_ADB_PMU68K=y
+CONFIG_ADB_PMU=y
 CONFIG_ADB_CUDA=y
 CONFIG_INPUT_ADBHID=y
 CONFIG_MAC_EMUMOUSEBTN=y
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index e522307db47c..92e80cf0d8aa 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -891,7 +891,7 @@ static void __init mac_identify(void)
 #ifdef CONFIG_ADB_CUDA
find_via_cuda();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
find_via_pmu();
 #endif
 }
diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index 7ccb799eeb57..28090a44fa09 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -85,7 +85,7 @@ static void cuda_write_pram(int offset, __u8 data)
 }
 #endif /* CONFIG_ADB_CUDA */
 
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 static long pmu_read_time(void)
 {
struct adb_request req;
@@ -136,7 +136,7 @@ static void pmu_write_pram(int offset, __u8 data)
while (!req.complete)
pmu_poll();
 }
-#endif /* CONFIG_ADB_PMU68K */
+#endif /* CONFIG_ADB_PMU */
 
 /*
  * VIA PRAM/RTC access routines
@@ -367,38 +367,6 @@ static void cuda_shutdown(void)
 }
 #endif /* CONFIG_ADB_CUDA */
 
-#ifdef CONFIG_ADB_PMU68K
-
-void pmu_restart(void)
-{
-   struct adb_request req;
-   if (pmu_request(&req, NULL,
-   2, PMU_SET_INTR_MASK, PMU_INT_ADB|PMU_INT_TICK) < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-   if (pmu_request(&req, NULL, 1, PMU_RESET) < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-}
-
-void pmu_shutdown(void)
-{
-   struct adb_request req;
-   if (pmu_request(&req, NULL,
-   2, PMU_SET_INTR_MASK, PMU_INT_ADB|PMU_INT_TICK) < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-   if (pmu_request(&req, NULL, 5, PMU_SHUTDOWN, 'M', 'A', 'T', 'T') < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-}
-
-#endif
-
 /*
  *---
  * Below this point are the generic routines; they'll dispatch to the
@@ -423,7 +391,7 @@ void mac_pram_read(int offset, __u8 *buffer, int len)
func = cuda_read_pram;
break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
func = pmu_read_pram;
break;
@@ -453,7 +421,7 @@ void mac_pram_write(int offset, __u8 *buffer, int len)
func = cuda_write_pram;
break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
func = pmu_write_pram;
break;
@@ -477,7 +445,7 @@ void mac_poweroff(void)
   macintosh_config->adb_type == MAC_ADB_CUDA) {
cuda_shutdown();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_shutdown();
 #endif
@@ -518,7 +486,7 @@ void mac_reset(void)
   macintosh_config->adb_type == MAC_ADB_CUDA) {
cuda_restart();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_restart();
 #endif
@@ -670,7 +638,7 @@ int mac_hwclk(int op, struct rtc_time *t)
now = cuda_read_time();
break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
now = 

[PATCH v4 11/11] macintosh/via-pmu: Disambiguate interrupt statistics

2018-07-02 Thread Finn Thain
Some of the event counters are overloaded which makes it very
difficult to interpret their values.

Counter 0 is supposed to report CB1 interrupts but it can also count
PMU_INT_WAITING_CHARGER events.

Counter 1 is supposed to report GPIO interrupts but it can also count
other events (depending upon the value of the PMU_INT_ADB bit).

Disambiguate these statistics with dedicated counters for GPIO and
CB1 interrupts.

Comments in the MkLinux source code say that the type 0 and type 1
interrupts are model-specific. Label them as "unknown".

This change to the contents of /proc/pmu/interrupts is by necessity
visible in userland. However, packages which interact with the PMU
(that is, pbbuttonsd, pmac-utils and pmud) don't open this file.
AFAIK, user software has no need to poll these counters.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
The file now looks like this,

  0:  0 (Unknown interrupt (type 0))
  1:  0 (Unknown interrupt (type 1))
  2:  0 (PC-Card eject button)
  3: 23 (Sound/Brightness button)
  4: 74 (ADB message)
  5:  0 (Battery state change)
  6:  0 (Environment interrupt)
  7:121 (Tick timer)
  8:  0 (Ghost interrupt (zero len))
  9:  1 (Empty interrupt (empty mask))
 10:  2 (Max irqs in a row)
 11:194 (Total CB1 triggered events)
 12:  0 (Total GPIO1 triggered events)

rather than this,

  0:194 (Total CB1 triggered events)
  1:  0 (Total GPIO1 triggered events)
  2:  0 (PC-Card eject button)
  3: 23 (Sound/Brightness button)
  4: 74 (ADB message)
  5:  0 (Battery state change)
  6:  0 (Environment interrupt)
  7:121 (Tick timer)
  8:  0 (Ghost interrupt (zero len))
  9:  1 (Empty interrupt (empty mask))
 10:  2 (Max irqs in a row)

If some parser exists for this file, and if this change is problematic,
we could increment the driver version number in /proc/pmu/info, to
correspond with the format change.
---
 drivers/macintosh/via-pmu.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 3da5d40309d4..d72c450aebe5 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -172,7 +172,9 @@ static int drop_interrupts;
 static int option_lid_wakeup = 1;
 #endif /* CONFIG_SUSPEND && CONFIG_PPC32 */
 static unsigned long async_req_locks;
-static unsigned int pmu_irq_stats[11];
+
+#define NUM_IRQ_STATS 13
+static unsigned int pmu_irq_stats[NUM_IRQ_STATS];
 
 static struct proc_dir_entry *proc_pmu_root;
 static struct proc_dir_entry *proc_pmu_info;
@@ -873,9 +875,9 @@ static int pmu_info_proc_show(struct seq_file *m, void *v)
 static int pmu_irqstats_proc_show(struct seq_file *m, void *v)
 {
int i;
-   static const char *irq_names[] = {
-   "Total CB1 triggered events",
-   "Total GPIO1 triggered events",
+   static const char *irq_names[NUM_IRQ_STATS] = {
+   "Unknown interrupt (type 0)",
+   "Unknown interrupt (type 1)",
"PC-Card eject button",
"Sound/Brightness button",
"ADB message",
@@ -884,10 +886,12 @@ static int pmu_irqstats_proc_show(struct seq_file *m, 
void *v)
"Tick timer",
"Ghost interrupt (zero len)",
"Empty interrupt (empty mask)",
-   "Max irqs in a row"
+   "Max irqs in a row",
+   "Total CB1 triggered events",
+   "Total GPIO1 triggered events",
 };
 
-   for (i=0; i<11; i++) {
+   for (i = 0; i < NUM_IRQ_STATS; i++) {
seq_printf(m, " %2u: %10u (%s)\n",
 i, pmu_irq_stats[i], irq_names[i]);
}
@@ -1622,7 +1626,7 @@ via_pmu_interrupt(int irq, void *arg)
}
if (intr & CB1_INT) {
adb_int_pending = 1;
-   pmu_irq_stats[0]++;
+   pmu_irq_stats[11]++;
}
if (intr & SR_INT) {
req = pmu_sr_intr();
@@ -1709,7 +1713,7 @@ gpio1_interrupt(int irq, void *arg)
disable_irq_nosync(gpio_irq);
gpio_irq_enabled = 0;
}
-   pmu_irq_stats[1]++;
+   pmu_irq_stats[12]++;
adb_int_pending = 1;
spin_unlock_irqrestore(&pmu_lock, flags);
via_pmu_interrupt(0, NULL);
-- 
2.16.4



[PATCH v4 06/11] macintosh/via-pmu: Add support for m68k PowerBooks

2018-07-02 Thread Finn Thain
Put #ifdefs around the Open Firmware, xmon, interrupt dispatch,
battery and suspend code. Add the necessary interrupt handling to
support m68k PowerBooks.

The pmu_kind value is available to userspace using the
PMU_IOC_GET_MODEL ioctl. It is not clear yet what hardware classes
will be needed to describe m68k PowerBook models, so pmu_kind is given
the provisional value PMU_UNKNOWN.

To find out about the hardware, user programs can use /proc/bootinfo
or /proc/hardware, or send the PMU_GET_VERSION command using /dev/adb.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/Kconfig   |   2 +-
 drivers/macintosh/via-pmu.c | 101 +++-
 2 files changed, 91 insertions(+), 12 deletions(-)

diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 97a420c11eed..9c6452b38c36 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -65,7 +65,7 @@ config ADB_CUDA
  If unsure say Y.
 
 config ADB_PMU
-   bool "Support for PMU  based PowerMacs"
+   bool "Support for PMU based PowerMacs and PowerBooks"
depends on PPC_PMAC
help
  On PowerBooks, iBooks, and recent iMacs and Power Macintoshes, the
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 2557f3e49f18..a68e7a6f00cc 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Device driver for the via-pmu on Apple Powermacs.
+ * Device driver for the PMU in Apple PowerBooks and PowerMacs.
  *
  * The VIA (versatile interface adapter) interfaces to the PMU,
  * a 6805 microprocessor core whose primary function is to control
@@ -49,20 +49,26 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#ifdef CONFIG_PPC_PMAC
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#else
+#include 
+#include 
+#include 
+#endif
 
 #include "via-pmu-event.h"
 
@@ -97,8 +103,13 @@ static DEFINE_MUTEX(pmu_info_proc_mutex);
 #define ANH(15*RS) /* A-side data, no handshake */
 
 /* Bits in B data register: both active low */
+#ifdef CONFIG_PPC_PMAC
 #define TACK   0x08/* Transfer acknowledge (input) */
 #define TREQ   0x10/* Transfer request (output) */
+#else
+#define TACK   0x02
+#define TREQ   0x04
+#endif
 
 /* Bits in ACR */
 #define SR_CTRL0x1c/* Shift register control bits 
*/
@@ -140,13 +151,15 @@ static int data_index;
 static int data_len;
 static volatile int adb_int_pending;
 static volatile int disable_poll;
-static struct device_node *vias;
 static int pmu_kind = PMU_UNKNOWN;
 static int pmu_fully_inited;
 static int pmu_has_adb;
+#ifdef CONFIG_PPC_PMAC
 static volatile unsigned char __iomem *via1;
 static volatile unsigned char __iomem *via2;
+static struct device_node *vias;
 static struct device_node *gpio_node;
+#endif
 static unsigned char __iomem *gpio_reg;
 static int gpio_irq = 0;
 static int gpio_irq_enabled = -1;
@@ -273,6 +286,7 @@ static char *pbook_type[] = {
 
 int __init find_via_pmu(void)
 {
+#ifdef CONFIG_PPC_PMAC
u64 taddr;
const u32 *reg;
 
@@ -355,9 +369,6 @@ int __init find_via_pmu(void)
if (!init_pmu())
goto fail_init;
 
-   printk(KERN_INFO "PMU driver v%d initialized for %s, firmware: %02x\n",
-  PMU_DRIVER_VERSION, pbook_type[pmu_kind], pmu_version);
-  
sys_ctrler = SYS_CTRLER_PMU;

return 1;
@@ -373,6 +384,30 @@ int __init find_via_pmu(void)
vias = NULL;
pmu_state = uninitialized;
return 0;
+#else
+   if (macintosh_config->adb_type != MAC_ADB_PB2)
+   return 0;
+
+   pmu_kind = PMU_UNKNOWN;
+
+   spin_lock_init(&pmu_lock);
+
+   pmu_has_adb = 1;
+
+   pmu_intr_mask = PMU_INT_PCEJECT |
+   PMU_INT_SNDBRT |
+   PMU_INT_ADB |
+   PMU_INT_TICK;
+
+   pmu_state = idle;
+
+   if (!init_pmu()) {
+   pmu_state = uninitialized;
+   return 0;
+   }
+
+   return 1;
+#endif /* !CONFIG_PPC_PMAC */
 }
 
 #ifdef CONFIG_ADB
@@ -396,13 +431,14 @@ static int pmu_init(void)
  */
 static int __init via_pmu_start(void)
 {
-   unsigned int irq;
+   unsigned int __maybe_unused irq;
 
if (pmu_state == uninitialized)
return -ENODEV;
 
batt_req.complete = 1;
 
+#ifdef CONFIG_PPC_PMAC
irq = irq_of_parse_and_map(vias, 0);
if (!irq) {
printk(KERN_ERR "via-pmu: can't map interrupt\n");
@@ -439,6 +475,19 @@ static int __init via_pmu_start(void)
 
/* Enable interrupts */
out_8(&via1[IER], IER_SET | SR_INT | CB1_INT);
+#else
+   if (request_irq(IRQ_MAC_ADB_SR, via_pmu_interrupt, IRQF_NO_SUSPEND,
+

[PATCH v4 08/11] macintosh/via-pmu68k: Don't load driver on unsupported hardware

2018-07-02 Thread Finn Thain
Don't load the via-pmu68k driver on early PowerBooks. The M50753 PMU
device found in those models was never supported by this driver.
Attempting to load the driver usually causes a boot hang.

Cc: Geert Uytterhoeven 
Signed-off-by: Finn Thain 
Reviewed-by: Michael Schmitz 
---
 arch/m68k/mac/misc.c   | 6 ++
 drivers/macintosh/via-pmu68k.c | 4 
 include/uapi/linux/pmu.h   | 2 +-
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index c68054361615..7ccb799eeb57 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -478,8 +478,7 @@ void mac_poweroff(void)
cuda_shutdown();
 #endif
 #ifdef CONFIG_ADB_PMU68K
-   } else if (macintosh_config->adb_type == MAC_ADB_PB1
-   || macintosh_config->adb_type == MAC_ADB_PB2) {
+   } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_shutdown();
 #endif
}
@@ -520,8 +519,7 @@ void mac_reset(void)
cuda_restart();
 #endif
 #ifdef CONFIG_ADB_PMU68K
-   } else if (macintosh_config->adb_type == MAC_ADB_PB1
-   || macintosh_config->adb_type == MAC_ADB_PB2) {
+   } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_restart();
 #endif
} else if (CPU_IS_030) {
diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c
index d545ed45e482..bec8e1837d7d 100644
--- a/drivers/macintosh/via-pmu68k.c
+++ b/drivers/macintosh/via-pmu68k.c
@@ -175,9 +175,6 @@ static s8 pmu_data_len[256][2] = {
 int __init find_via_pmu(void)
 {
switch (macintosh_config->adb_type) {
-   case MAC_ADB_PB1:
-   pmu_kind = PMU_68K_V1;
-   break;
case MAC_ADB_PB2:
pmu_kind = PMU_68K_V2;
break;
@@ -785,7 +782,6 @@ pmu_enable_backlight(int on)
/* first call: get current backlight value */
if (backlight_level < 0) {
switch(pmu_kind) {
-   case PMU_68K_V1:
case PMU_68K_V2:
pmu_request(, NULL, 3, PMU_READ_NVRAM, 0x14, 0xe);
while (!req.complete)
diff --git a/include/uapi/linux/pmu.h b/include/uapi/linux/pmu.h
index 89cb1acea93a..e128f609281a 100644
--- a/include/uapi/linux/pmu.h
+++ b/include/uapi/linux/pmu.h
@@ -93,7 +93,7 @@ enum {
PMU_HEATHROW_BASED, /* PowerBook G3 series */
PMU_PADDINGTON_BASED,   /* 1999 PowerBook G3 */
PMU_KEYLARGO_BASED, /* Core99 motherboard (PMU99) */
-   PMU_68K_V1, /* 68K PMU, version 1 */
+   PMU_68K_V1, /* Unused/deprecated */
PMU_68K_V2, /* 68K PMU, version 2 */
 };
 
-- 
2.16.4



[PATCH v4 10/11] macintosh/via-pmu: Clean up interrupt statistics

2018-07-02 Thread Finn Thain
Replace an open-coded ffs() with the function call.
Simplify an if-else cascade using a switch statement.
Correct a typo and an indentation issue.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 39 ++-
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index a68e7a6f00cc..3da5d40309d4 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1355,7 +1355,8 @@ pmu_resume(void)
 static void
 pmu_handle_data(unsigned char *data, int len)
 {
-   unsigned char ints, pirq;
+   unsigned char ints;
+   int idx;
int i = 0;
 
asleep = 0;
@@ -1377,25 +1378,24 @@ pmu_handle_data(unsigned char *data, int len)
ints &= ~(PMU_INT_ADB_AUTO | PMU_INT_AUTO_SRQ_POLL);
 
 next:
-
if (ints == 0) {
if (i > pmu_irq_stats[10])
pmu_irq_stats[10] = i;
return;
}
-
-   for (pirq = 0; pirq < 8; pirq++)
-   if (ints & (1 << pirq))
-   break;
-   pmu_irq_stats[pirq]++;
i++;
-   ints &= ~(1 << pirq);
+
+   idx = ffs(ints) - 1;
+   ints &= ~BIT(idx);
+
+   pmu_irq_stats[idx]++;
 
/* Note: for some reason, we get an interrupt with len=1,
 * data[0]==0 after each normal ADB interrupt, at least
 * on the Pismo. Still investigating...  --BenH
 */
-   if ((1 << pirq) & PMU_INT_ADB) {
+   switch (BIT(idx)) {
+   case PMU_INT_ADB:
if ((data[0] & PMU_INT_ADB_AUTO) == 0) {
struct adb_request *req = req_awaiting_reply;
if (!req) {
@@ -1433,25 +1433,28 @@ pmu_handle_data(unsigned char *data, int len)
adb_input(data+1, len-1, 1);
 #endif /* CONFIG_ADB */
}
-   }
+   break;
+
/* Sound/brightness button pressed */
-   else if ((1 << pirq) & PMU_INT_SNDBRT) {
+   case PMU_INT_SNDBRT:
 #ifdef CONFIG_PMAC_BACKLIGHT
if (len == 3)
pmac_backlight_set_legacy_brightness_pmu(data[1] >> 4);
 #endif
-   }
+   break;
+
/* Tick interrupt */
-   else if ((1 << pirq) & PMU_INT_TICK) {
-   /* Environement or tick interrupt, query batteries */
+   case PMU_INT_TICK:
+   /* Environment or tick interrupt, query batteries */
if (pmu_battery_count) {
if ((--query_batt_timer) == 0) {
query_battery_state();
query_batt_timer = BATTERY_POLLING_COUNT;
}
}
-}
-   else if ((1 << pirq) & PMU_INT_ENVIRONMENT) {
+   break;
+
+   case PMU_INT_ENVIRONMENT:
if (pmu_battery_count)
query_battery_state();
pmu_pass_intr(data, len);
@@ -1461,7 +1464,9 @@ pmu_handle_data(unsigned char *data, int len)
via_pmu_event(PMU_EVT_POWER, !!(data[1]&8));
via_pmu_event(PMU_EVT_LID, data[1]&1);
}
-   } else {
+   break;
+
+   default:
   pmu_pass_intr(data, len);
}
goto next;
-- 
2.16.4



[PATCH v4 07/11] macintosh/via-pmu: Explicitly specify CONFIG_PPC_PMAC dependencies

2018-07-02 Thread Finn Thain
At present, CONFIG_ADB_PMU depends on CONFIG_PPC_PMAC. When this gets
relaxed to CONFIG_PPC_PMAC || CONFIG_MAC, those Kconfig symbols with
implicit deps on PPC_PMAC will need explicit deps. Add them now.
No functional change.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 9c6452b38c36..26abae4c899d 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -79,7 +79,7 @@ config ADB_PMU
 
 config ADB_PMU_LED
bool "Support for the Power/iBook front LED"
-   depends on ADB_PMU
+   depends on PPC_PMAC && ADB_PMU
select NEW_LEDS
select LEDS_CLASS
help
@@ -122,7 +122,7 @@ config PMAC_MEDIABAY
 
 config PMAC_BACKLIGHT
bool "Backlight control for LCD screens"
-   depends on ADB_PMU && FB = y && (BROKEN || !PPC64)
+   depends on PPC_PMAC && ADB_PMU && FB = y && (BROKEN || !PPC64)
select FB_BACKLIGHT
help
  Say Y here to enable Macintosh specific extensions of the generic
-- 
2.16.4



[PATCH v4 05/11] macintosh/via-pmu: Replace via pointer with via1 and via2 pointers

2018-07-02 Thread Finn Thain
On most PowerPC Macs, the PMU driver uses the shift register and
IO port B from a single VIA chip.

On 68k and early PowerPC PowerBooks, the driver uses the shift register
from one VIA chip together with IO port B from another.

Replace via with via1 and via2 to accommodate this. For the
CONFIG_PPC_PMAC case, set via1 = via2 so there is no change.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/via-pmu.c | 142 +---
 1 file changed, 69 insertions(+), 73 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 6a6f1666712e..2557f3e49f18 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -76,7 +76,6 @@
 #define BATTERY_POLLING_COUNT  2
 
 static DEFINE_MUTEX(pmu_info_proc_mutex);
-static volatile unsigned char __iomem *via;
 
 /* VIA registers - spaced 0x200 bytes apart */
 #define RS 0x200   /* skip between registers */
@@ -145,6 +144,8 @@ static struct device_node *vias;
 static int pmu_kind = PMU_UNKNOWN;
 static int pmu_fully_inited;
 static int pmu_has_adb;
+static volatile unsigned char __iomem *via1;
+static volatile unsigned char __iomem *via2;
 static struct device_node *gpio_node;
 static unsigned char __iomem *gpio_reg;
 static int gpio_irq = 0;
@@ -340,14 +341,14 @@ int __init find_via_pmu(void)
} else
pmu_kind = PMU_UNKNOWN;
 
-   via = ioremap(taddr, 0x2000);
-   if (via == NULL) {
+   via1 = via2 = ioremap(taddr, 0x2000);
+   if (via1 == NULL) {
printk(KERN_ERR "via-pmu: Can't map address !\n");
goto fail_via_remap;
}

-   out_8([IER], IER_CLR | 0x7f);   /* disable all intrs */
-   out_8([IFR], 0x7f); /* clear IFR */
+   out_8([IER], IER_CLR | 0x7f);  /* disable all intrs */
+   out_8([IFR], 0x7f);/* clear IFR */
 
pmu_state = idle;
 
@@ -362,8 +363,8 @@ int __init find_via_pmu(void)
return 1;
 
  fail_init:
-   iounmap(via);
-   via = NULL;
+   iounmap(via1);
+   via1 = via2 = NULL;
  fail_via_remap:
iounmap(gpio_reg);
gpio_reg = NULL;
@@ -437,7 +438,7 @@ static int __init via_pmu_start(void)
}
 
/* Enable interrupts */
-   out_8([IER], IER_SET | SR_INT | CB1_INT);
+   out_8([IER], IER_SET | SR_INT | CB1_INT);
 
pmu_fully_inited = 1;
 
@@ -535,8 +536,8 @@ init_pmu(void)
struct adb_request req;
 
/* Negate TREQ. Set TACK to input and TREQ to output. */
-   out_8([B], in_8([B]) | TREQ);
-   out_8([DIRB], (in_8([DIRB]) | TREQ) & ~TACK);
+   out_8([B], in_8([B]) | TREQ);
+   out_8([DIRB], (in_8([DIRB]) | TREQ) & ~TACK);
 
pmu_request(, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
timeout =  10;
@@ -1137,7 +1138,7 @@ wait_for_ack(void)
 * reported
 */
int timeout = 4000;
-   while ((in_8([B]) & TACK) == 0) {
+   while ((in_8([B]) & TACK) == 0) {
if (--timeout < 0) {
printk(KERN_ERR "PMU not responding (!ack)\n");
return;
@@ -1151,23 +1152,19 @@ wait_for_ack(void)
 static inline void
 send_byte(int x)
 {
-   volatile unsigned char __iomem *v = via;
-
-   out_8([ACR], in_8([ACR]) | SR_OUT | SR_EXT);
-   out_8([SR], x);
-   out_8([B], in_8([B]) & ~TREQ);  /* assert TREQ */
-   (void)in_8([B]);
+   out_8([ACR], in_8([ACR]) | SR_OUT | SR_EXT);
+   out_8([SR], x);
+   out_8([B], in_8([B]) & ~TREQ);/* assert TREQ */
+   (void)in_8([B]);
 }
 
 static inline void
 recv_byte(void)
 {
-   volatile unsigned char __iomem *v = via;
-
-   out_8([ACR], (in_8([ACR]) & ~SR_OUT) | SR_EXT);
-   in_8([SR]);   /* resets SR */
-   out_8([B], in_8([B]) & ~TREQ);
-   (void)in_8([B]);
+   out_8([ACR], (in_8([ACR]) & ~SR_OUT) | SR_EXT);
+   in_8([SR]);/* resets SR */
+   out_8([B], in_8([B]) & ~TREQ);
+   (void)in_8([B]);
 }
 
 static inline void
@@ -1270,7 +1267,7 @@ pmu_suspend(void)
if (!adb_int_pending && pmu_state == idle && 
!req_awaiting_reply) {
if (gpio_irq >= 0)
disable_irq_nosync(gpio_irq);
-   out_8([IER], CB1_INT | IER_CLR);
+   out_8([IER], CB1_INT | IER_CLR);
spin_unlock_irqrestore(_lock, flags);
break;
}
@@ -1294,7 +1291,7 @@ pmu_resume(void)
adb_int_pending = 1;
if (gpio_irq >= 0)
enable_irq(gpio_irq);
-   out_8([IER], CB1_INT | IER_SET);
+   out_8([IER], CB1_INT | IER_SET);
spin_unlock_irqrestore(_lock, flags);
pmu_poll();
 }
@@ -1419,20 +1416,20 @@ pmu_sr_intr(void)
struct adb_request *req;
int bite = 0;
 
-   if (in_8([B]) & 

[PATCH v4 04/11] macintosh/via-pmu: Enhance state machine with new 'uninitialized' state

2018-07-02 Thread Finn Thain
On 68k Macs, the via/vias pointer can't be used to determine whether
the PMU driver has been initialized. For portability, add a new state
to indicate that find_via_pmu() succeeded.

After find_via_pmu() executes, testing vias == NULL is equivalent to
testing via == NULL. Replace these tests with pmu_state == uninitialized
which is simpler and more consistent. No functional change.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/via-pmu.c | 44 ++--
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index c313ddfdb17a..6a6f1666712e 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -114,6 +114,7 @@ static volatile unsigned char __iomem *via;
 #define CB1_INT0x10/* transition on CB1 input */
 
 static volatile enum pmu_state {
+   uninitialized = 0,
idle,
sending,
intack,
@@ -274,7 +275,7 @@ int __init find_via_pmu(void)
u64 taddr;
const u32 *reg;
 
-   if (via)
+   if (pmu_state != uninitialized)
return 1;
vias = of_find_node_by_name(NULL, "via-pmu");
if (vias == NULL)
@@ -369,20 +370,19 @@ int __init find_via_pmu(void)
  fail:
of_node_put(vias);
vias = NULL;
+   pmu_state = uninitialized;
return 0;
 }
 
 #ifdef CONFIG_ADB
 static int pmu_probe(void)
 {
-   return vias == NULL? -ENODEV: 0;
+   return pmu_state == uninitialized ? -ENODEV : 0;
 }
 
 static int pmu_init(void)
 {
-   if (vias == NULL)
-   return -ENODEV;
-   return 0;
+   return pmu_state == uninitialized ? -ENODEV : 0;
 }
 #endif /* CONFIG_ADB */
 
@@ -397,7 +397,7 @@ static int __init via_pmu_start(void)
 {
unsigned int irq;
 
-   if (vias == NULL)
+   if (pmu_state == uninitialized)
return -ENODEV;
 
batt_req.complete = 1;
@@ -463,7 +463,7 @@ arch_initcall(via_pmu_start);
  */
 static int __init via_pmu_dev_init(void)
 {
-   if (vias == NULL)
+   if (pmu_state == uninitialized)
return -ENODEV;
 
 #ifdef CONFIG_PMAC_BACKLIGHT
@@ -929,7 +929,7 @@ static int pmu_send_request(struct adb_request *req, int 
sync)
 {
int i, ret;
 
-   if ((vias == NULL) || (!pmu_fully_inited)) {
+   if (pmu_state == uninitialized || !pmu_fully_inited) {
req->complete = 1;
return -ENXIO;
}
@@ -1023,7 +1023,7 @@ static int __pmu_adb_autopoll(int devs)
 
 static int pmu_adb_autopoll(int devs)
 {
-   if ((vias == NULL) || (!pmu_fully_inited) || !pmu_has_adb)
+   if (pmu_state == uninitialized || !pmu_fully_inited || !pmu_has_adb)
return -ENXIO;
 
adb_dev_map = devs;
@@ -1036,7 +1036,7 @@ static int pmu_adb_reset_bus(void)
struct adb_request req;
int save_autopoll = adb_dev_map;
 
-   if ((vias == NULL) || (!pmu_fully_inited) || !pmu_has_adb)
+   if (pmu_state == uninitialized || !pmu_fully_inited || !pmu_has_adb)
return -ENXIO;
 
/* anyone got a better idea?? */
@@ -1072,7 +1072,7 @@ pmu_request(struct adb_request *req, void (*done)(struct 
adb_request *),
va_list list;
int i;
 
-   if (vias == NULL)
+   if (pmu_state == uninitialized)
return -ENXIO;
 
if (nbytes < 0 || nbytes > 32) {
@@ -1097,7 +1097,7 @@ pmu_queue_request(struct adb_request *req)
unsigned long flags;
int nsend;
 
-   if (via == NULL) {
+   if (pmu_state == uninitialized) {
req->complete = 1;
return -ENXIO;
}
@@ -1210,7 +1210,7 @@ pmu_start(void)
 void
 pmu_poll(void)
 {
-   if (!via)
+   if (pmu_state == uninitialized)
return;
if (disable_poll)
return;
@@ -1220,7 +1220,7 @@ pmu_poll(void)
 void
 pmu_poll_adb(void)
 {
-   if (!via)
+   if (pmu_state == uninitialized)
return;
if (disable_poll)
return;
@@ -1235,7 +1235,7 @@ pmu_poll_adb(void)
 void
 pmu_wait_complete(struct adb_request *req)
 {
-   if (!via)
+   if (pmu_state == uninitialized)
return;
while((pmu_state != idle && pmu_state != locked) || !req->complete)
via_pmu_interrupt(0, NULL);
@@ -1251,7 +1251,7 @@ pmu_suspend(void)
 {
unsigned long flags;
 
-   if (!via)
+   if (pmu_state == uninitialized)
return;

spin_lock_irqsave(_lock, flags);
@@ -1282,7 +1282,7 @@ pmu_resume(void)
 {
unsigned long flags;
 
-   if (!via || (pmu_suspended < 1))
+   if (pmu_state == uninitialized || pmu_suspended < 1)
return;
 
spin_lock_irqsave(_lock, flags);
@@ -1644,7 +1644,7 @@ pmu_enable_irled(int on)
 {
struct adb_request req;
 
-   if (vias == NULL)
+   if (pmu_state == 

[PATCH v4 00/11] macintosh: Resolve various PMU driver problems

2018-07-02 Thread Finn Thain
This series of patches has the following aims.

1) Eliminate duplicated code. Linux presently has two drivers for
   the 68HC05-based PMU devices found in Macs: via-pmu and via-pmu68k.
   There's no value in having separate PMU drivers for each architecture.

2) Avoid further work on via-pmu68k that's not needed for via-pmu.

3) Fix some bugs in the via-pmu driver.

4) Enable the /dev/pmu and /proc/pmu/* userspace APIs on m68k Macs
   by adopting via-pmu.

5) Improve stability on early 100-series PowerBooks by loading no PMU
   driver at all. Neither via-pmu nor via-pmu68k supports the early
   M50753-based PMU device found in these models.

6) Assist the out-of-tree NuBus PowerMac port to support PMU designs
   shared with the m68k Mac port (e.g. PowerBooks 190 and 5300).

This patch series has been regression tested on various PowerBooks
(190, 520, 3400, Pismo G3) and PowerMacs (Beige G3, G5). These patches
did not affect userland utilities. (Note that there is a userland-
visible change to the contents of /proc/pmu/interrupts.)

Changed since v1:
1) Added blank lines after 'break' statements in patch 10.
2) Improved patch description for patch 3.
3) Added reviewed-by tags.
4) Split patch 8 to make code review easier.

Changed since v2:
1) Added reviewed-by tag.
2) Retained PMU_68K_V1 and PMU_68K_V2 symbols.

Changed since v3:
1) Rebased on v4.18-rc2.
2) Omitted patch 10/12, since these RTC changes now conflict with mainline.
   It will be reworked once the mainline m68k/powerpc RTC code stabilizes.


Finn Thain (11):
  macintosh/via-pmu: Fix section mismatch warning
  macintosh/via-pmu: Add missing mmio accessors
  macintosh/via-pmu: Don't clear shift register interrupt flag twice
  macintosh/via-pmu: Enhance state machine with new 'uninitialized'
state
  macintosh/via-pmu: Replace via pointer with via1 and via2 pointers
  macintosh/via-pmu: Add support for m68k PowerBooks
  macintosh/via-pmu: Explicitly specify CONFIG_PPC_PMAC dependencies
  macintosh/via-pmu68k: Don't load driver on unsupported hardware
  macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver
  macintosh/via-pmu: Clean up interrupt statistics
  macintosh/via-pmu: Disambiguate interrupt statistics

 arch/m68k/configs/mac_defconfig   |   2 +-
 arch/m68k/configs/multi_defconfig |   2 +-
 arch/m68k/mac/config.c|   2 +-
 arch/m68k/mac/misc.c  |  54 +--
 drivers/macintosh/Kconfig |  19 +-
 drivers/macintosh/Makefile|   1 -
 drivers/macintosh/adb.c   |   2 +-
 drivers/macintosh/via-pmu.c   | 346 ++--
 drivers/macintosh/via-pmu68k.c| 850 --
 include/uapi/linux/pmu.h  |   4 +-
 10 files changed, 235 insertions(+), 1047 deletions(-)
 delete mode 100644 drivers/macintosh/via-pmu68k.c

-- 
2.16.4



[PATCH v4 02/11] macintosh/via-pmu: Add missing mmio accessors

2018-07-02 Thread Finn Thain
Add missing in_8() accessors to init_pmu() and pmu_sr_intr().

This fixes several sparse warnings:
drivers/macintosh/via-pmu.c:536:29: warning: dereference of noderef expression
drivers/macintosh/via-pmu.c:537:33: warning: dereference of noderef expression
drivers/macintosh/via-pmu.c:1455:17: warning: dereference of noderef expression
drivers/macintosh/via-pmu.c:1456:69: warning: dereference of noderef expression

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index f8a2c917201f..ba41220f618e 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -534,8 +534,9 @@ init_pmu(void)
int timeout;
struct adb_request req;
 
-   out_8(&via[B], via[B] | TREQ);  /* negate TREQ */
-   out_8(&via[DIRB], (via[DIRB] | TREQ) & ~TACK);  /* TACK in, TREQ out */
+   /* Negate TREQ. Set TACK to input and TREQ to output. */
+   out_8(&via[B], in_8(&via[B]) | TREQ);
+   out_8(&via[DIRB], (in_8(&via[DIRB]) | TREQ) & ~TACK);
 
pmu_request(, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
timeout =  10;
@@ -1418,8 +1419,8 @@ pmu_sr_intr(void)
struct adb_request *req;
int bite = 0;
 
-   if (via[B] & TREQ) {
-   printk(KERN_ERR "PMU: spurious SR intr (%x)\n", via[B]);
+   if (in_8(&via[B]) & TREQ) {
+   printk(KERN_ERR "PMU: spurious SR intr (%x)\n", in_8(&via[B]));
out_8([IFR], SR_INT);
return NULL;
}
-- 
2.16.4



[PATCH v4 01/11] macintosh/via-pmu: Fix section mismatch warning

2018-07-02 Thread Finn Thain
The pmu_init() function has the __init qualifier, but the ops struct
that holds a pointer to it does not. This causes a build warning.
The driver works fine because the pointer is only dereferenced early.

The function is so small that there's negligible benefit from using
the __init qualifier. Remove it to fix the warning, consistent with
the other ADB drivers.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 25c1ce811053..f8a2c917201f 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -378,7 +378,7 @@ static int pmu_probe(void)
return vias == NULL? -ENODEV: 0;
 }
 
-static int __init pmu_init(void)
+static int pmu_init(void)
 {
if (vias == NULL)
return -ENODEV;
-- 
2.16.4



[PATCH v4 03/11] macintosh/via-pmu: Don't clear shift register interrupt flag twice

2018-07-02 Thread Finn Thain
The shift register interrupt flag gets cleared in via_pmu_interrupt()
and once again in pmu_sr_intr(). Fix this theoretical race condition.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index ba41220f618e..c313ddfdb17a 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1421,7 +1421,6 @@ pmu_sr_intr(void)
 
if (in_8([B]) & TREQ) {
printk(KERN_ERR "PMU: spurious SR intr (%x)\n", in_8([B]));
-   out_8([IFR], SR_INT);
return NULL;
}
/* The ack may not yet be low when we get the interrupt */
-- 
2.16.4



[PATCH kernel] powerpc/powernv/ioda2: Add 256M IOMMU page size to the default POWER8 case

2018-07-02 Thread Alexey Kardashevskiy
The sketchy bypass uses 256M pages so add this page size as well.

This should cause no behavioral change but will be used later.

Fixes: 477afd6ea6 "powerpc/ioda: Use ibm,supported-tce-sizes for IOMMU page 
size mask"
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5bd0eb6..557c11d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2925,7 +2925,7 @@ static unsigned long pnv_ioda_parse_tce_sizes(struct 
pnv_phb *phb)
/* Add 16M for POWER8 by default */
if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
!cpu_has_feature(CPU_FTR_ARCH_300))
-   mask |= SZ_16M;
+   mask |= SZ_16M | SZ_256M;
return mask;
}
 
-- 
2.11.0



Re: [PATCH 1/3] powerpc/powernv/pci: Track largest available TCE order per PHB

2018-07-02 Thread Alexey Kardashevskiy
On Mon, 2 Jul 2018 17:32:56 +1000
Alexey Kardashevskiy  wrote:

> On Fri, 29 Jun 2018 17:34:35 +1000
> Russell Currey  wrote:
> 
> > Knowing the largest possible TCE size of a PHB is useful, so get it
> > out of the device tree.  This relies on the property being added in
> > OPAL.
> > 
> > It is assumed that any PHB4 or later machine would be running
> > firmware that implemented this property, and otherwise assumed to
> > be PHB3, which has a maximum TCE order of 28 bits or 256MB TCEs.
> > 
> > This is used later in the series.
> > 
> > Signed-off-by: Russell Currey 
> > ---
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 16 
> >  arch/powerpc/platforms/powernv/pci.h  |  3 +++
> >  2 files changed, 19 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> > b/arch/powerpc/platforms/powernv/pci-ioda.c index
> > 5bd0eb6681bc..17c590087279 100644 ---
> > a/arch/powerpc/platforms/powernv/pci-ioda.c +++
> > b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3873,11 +3873,13 @@
> > static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> > struct resource r; const __be64 *prop64;
> > const __be32 *prop32;
> > +   struct property *prop;
> > int len;
> > unsigned int segno;
> > u64 phb_id;
> > void *aux;
> > long rc;
> > +   u32 val;
> >  
> > if (!of_device_is_available(np))
> > return;
> > @@ -4016,6 +4018,20 @@ static void __init
> > pnv_pci_init_ioda_phb(struct device_node *np, }
> > phb->ioda.pe_array = aux + pemap_off;
> >  
> > +   phb->ioda.max_tce_order = 0;
> > +   /* Get TCE order from the DT.  If it's not present, assume
> > P8 */
> > +   if (!of_get_property(np, "ibm,supported-tce-sizes", NULL))
> > {
> > +   phb->ioda.max_tce_order = 28; /* assume P8 256mb
> > TCEs */
> > +   } else {
> > +   of_property_for_each_u32(np,
> > "ibm,supported-tce-sizes", prop,
> > +prop32, val) {
> > +   if (val > phb->ioda.max_tce_order)
> > +   phb->ioda.max_tce_order = val;
> > +   }
> > +   pr_debug("PHB%llx Found max TCE order of %d
> > bits\n",
> > +phb->opal_id, phb->ioda.max_tce_order);
> > +   }  
> 
> 
> pnv_ioda_parse_tce_sizes() does this, use it. It even reports 256MB
> pages for P8 as in v4.18-rc3.


ah, no, not in rc3, my bad. I'll post it soon.


--
Alexey


Re: [PATCH 1/3] powerpc/powernv/pci: Track largest available TCE order per PHB

2018-07-02 Thread Alexey Kardashevskiy
On Fri, 29 Jun 2018 17:34:35 +1000
Russell Currey  wrote:

> Knowing the largest possible TCE size of a PHB is useful, so get it out
> of the device tree.  This relies on the property being added in OPAL.
> 
> It is assumed that any PHB4 or later machine would be running firmware
> that implemented this property, and otherwise assumed to be PHB3, which
> has a maximum TCE order of 28 bits or 256MB TCEs.
> 
> This is used later in the series.
> 
> Signed-off-by: Russell Currey 
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 16 
>  arch/powerpc/platforms/powernv/pci.h  |  3 +++
>  2 files changed, 19 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 5bd0eb6681bc..17c590087279 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -3873,11 +3873,13 @@ static void __init pnv_pci_init_ioda_phb(struct 
> device_node *np,
>   struct resource r;
>   const __be64 *prop64;
>   const __be32 *prop32;
> + struct property *prop;
>   int len;
>   unsigned int segno;
>   u64 phb_id;
>   void *aux;
>   long rc;
> + u32 val;
>  
>   if (!of_device_is_available(np))
>   return;
> @@ -4016,6 +4018,20 @@ static void __init pnv_pci_init_ioda_phb(struct 
> device_node *np,
>   }
>   phb->ioda.pe_array = aux + pemap_off;
>  
> + phb->ioda.max_tce_order = 0;
> + /* Get TCE order from the DT.  If it's not present, assume P8 */
> + if (!of_get_property(np, "ibm,supported-tce-sizes", NULL)) {
> + phb->ioda.max_tce_order = 28; /* assume P8 256mb TCEs */
> + } else {
> + of_property_for_each_u32(np, "ibm,supported-tce-sizes", prop,
> +  prop32, val) {
> + if (val > phb->ioda.max_tce_order)
> + phb->ioda.max_tce_order = val;
> + }
> + pr_debug("PHB%llx Found max TCE order of %d bits\n",
> +  phb->opal_id, phb->ioda.max_tce_order);
> + }


pnv_ioda_parse_tce_sizes() does this, use it. It even reports 256MB pages for 
P8 as in v4.18-rc3. And since this is going to be used once per device driver 
bind operation, there is no need at all to cache it, just call 
ilog2(pnv_ioda_parse_tce_sizes()) whenever you want to know the maximum page 
size.


> +
>   /*
>* Choose PE number for root bus, which shouldn't have
>* M64 resources consumed by its child devices. To pick
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> index eada4b6068cb..c9952def5e93 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -173,6 +173,9 @@ struct pnv_phb {
>   struct list_headpe_list;
>   struct mutexpe_list_mutex;
>  
> + /* Largest supported TCE order bits */
> + uint8_t max_tce_order;
> +
>   /* Reverse map of PEs, indexed by {bus, devfn} */
>   unsigned intpe_rmap[0x1];
>   } ioda;
> -- 
> 2.17.1
> 



--
Alexey


Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-02 Thread Alexey Kardashevskiy
On Mon, 2 Jul 2018 14:52:43 +1000
David Gibson  wrote:

> On Mon, Jul 02, 2018 at 02:33:30PM +1000, Alexey Kardashevskiy wrote:
> > On Mon, 2 Jul 2018 14:08:52 +1000
> > David Gibson  wrote:
> >   
> > > On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:  
> > > > On Fri, 29 Jun 2018 15:18:20 +1000
> > > > Alexey Kardashevskiy  wrote:
> > > > 
> > > > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > > > David Gibson  wrote:
> > > > > 
> > > > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy 
> > > > > > wrote:  
> > > > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > > > David Gibson  wrote:
> > > > > > > 
> > > > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy 
> > > > > > > > wrote:
> > > > > > > > > We already have a check in 
> > > > > > > > > drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > > > hardware won't
> > > > > > > > > get access to unassigned host memory.
> > > > > > > > > 
> > > > > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > > > > accelerated
> > > > > > > > > code) so the user space can pin memory backed with 64k pages 
> > > > > > > > > and create
> > > > > > > > > a hardware TCE table with a bigger page size. We were lucky 
> > > > > > > > > so far and
> > > > > > > > > did not hit this yet as the very first time the mapping 
> > > > > > > > > happens
> > > > > > > > > we do not have tbl::it_userspace allocated yet and fall back 
> > > > > > > > > to
> > > > > > > > > the userspace which in turn calls VFIO IOMMU driver and that 
> > > > > > > > > fails
> > > > > > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > > > > > sustainable solution.
> > > > > > > > > 
> > > > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > > > preregistered
> > > > > > > > > region descriptor and changes the mm_iommu_xxx API to check 
> > > > > > > > > this against
> > > > > > > > > the IOMMU page size.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > > > ---
> > > > > > > > > Changes:
> > > > > > > > > v2:
> > > > > > > > > * explicitly check for compound pages before calling 
> > > > > > > > > compound_order()
> > > > > > > > > 
> > > > > > > > > ---
> > > > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and 
> > > > > > > > > tell it to
> > > > > > > > > advertise 16MB pages to the guest; a typical pseries guest 
> > > > > > > > > will use 16MB
> > > > > > > > > for IOMMU pages without checking the mmu pagesize and this 
> > > > > > > > > will fail
> > > > > > > > > at 
> > > > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > > > 
> > > > > > > > > With the change, mapping will fail in KVM and the guest will 
> > > > > > > > > print:
> > > > > > > > > 
> > > > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 
> > > > > > > > > 800 2000 18 1f returned 0 (liobn = 0x8001 
> > > > > > > > > starting addr = 800 0)
> > > > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 
> > > > > > > > > for /pci@8002000/ethernet@0
> > > > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > > > /pci@8002000/ethernet@0: -1  
> > > > > > > > 
> > > > > > > > [snip]
> > > > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > >   struct mm_iommu_table_group_mem_t **pmem)
> > > > > > > > >  {
> > > > > > > > >   struct mm_iommu_table_group_mem_t *mem;
> > > > > > > > > - long i, j, ret = 0, locked_entries = 0;
> > > > > > > > > + long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > > > >   struct page *page = NULL;
> > > > > > > > >  
> > > > > > > > >   mutex_lock(_list_mutex);
> > > > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > >   goto unlock_exit;
> > > > > > > > >   }
> > > > > > > > >  
> > > > > > >  > > +mem->pageshift = 30; /* start from 1G pages - the 
> > > > > > > biggest we have */  
> > > > > > > > 
> > > > > > > > What about 16G pages on an HPT system?
> > > > > > > 
> > > > > > > 
> > > > > > > Below in the loop mem->pageshift will reduce to the biggest 
> > > > > > > actual size
> > > > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > > > > pinned, no loss there.
> > > > > > 
> > > > > > Are you saying that 16G IOMMU pages aren't supported?  Or that 
> > > > > > there's
> > > > > > some reason a guest can never use them?  
> > > > > 
> > > > > 
> > > > > ah, 16_G_, not 

[PATCH v5 7/7] powerpc/pseries: Dump the SLB contents on SLB MCE errors.

2018-07-02 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

If we get a machine check exception due to SLB errors, dump the
current SLB contents, which will be very helpful in debugging the
root cause of the SLB errors. Introduce an exclusive per-cpu buffer to hold
faulty SLB entries. In real mode, the MCE handler saves the old SLB contents
into this buffer, accessible through the paca, and prints them out later in
virtual mode.

With this patch the console will log SLB contents like below on SLB MCE
errors:

[ 3022.938065] SLB contents of cpu 0x3
[ 3022.938066] 00 c800 400ea1b217000500
[ 3022.938067]   1T  ESID=   c0  VSID=  ea1b217 LLP:100
[ 3022.938068] 01 d800 400d43642f000510
[ 3022.938069]   1T  ESID=   d0  VSID=  d43642f LLP:110
[ 3022.938070] 05 f800 400a86c85f000500
[ 3022.938071]   1T  ESID=   f0  VSID=  a86c85f LLP:100
[ 3022.938072] 06 7f000800 400a628b13000d90
[ 3022.938073]   1T  ESID=   7f  VSID=  a628b13 LLP:110
[ 3022.938074] 07 1800 000b7979f523fd90
[ 3022.938075]  256M ESID=1  VSID=   b7979f523f LLP:110
[ 3022.938076] 08 c800 400ea1b217000510
[ 3022.938076]   1T  ESID=   c0  VSID=  ea1b217 LLP:110
[ 3022.938077] 09 c800 400ea1b217000510
[ 3022.938078]   1T  ESID=   c0  VSID=  ea1b217 LLP:110

Suggested-by: Aneesh Kumar K.V 
Suggested-by: Michael Ellerman 
Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |7 +++
 arch/powerpc/include/asm/paca.h   |1 
 arch/powerpc/mm/slb.c |   57 +
 arch/powerpc/platforms/pseries/ras.c  |   10 
 arch/powerpc/platforms/pseries/setup.c|   10 
 5 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index cc00a7088cf3..5a3fe282076d 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -485,9 +485,16 @@ static inline void hpte_init_pseries(void) { }
 
 extern void hpte_init_native(void);
 
+struct slb_entry {
+   u64 esid;
+   u64 vsid;
+};
+
 extern void slb_initialize(void);
 extern void slb_flush_and_rebolt(void);
 extern void slb_flush_and_rebolt_realmode(void);
+extern void slb_save_contents(struct slb_entry *slb_ptr);
+extern void slb_dump_contents(struct slb_entry *slb_ptr);
 
 extern void slb_vmalloc_update(void);
 extern void slb_set_size(u16 size);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index b441fef53077..653f87c69423 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -253,6 +253,7 @@ struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_PSERIES
u8 *mce_data_buf;   /* buffer to hold per cpu rtas errlog */
+   struct slb_entry *mce_faulty_slbs;
 #endif /* CONFIG_PPC_PSERIES */
 } cacheline_aligned;
 
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5b1813b98358..476ab0b1d4e8 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -151,6 +151,63 @@ void slb_flush_and_rebolt_realmode(void)
get_paca()->slb_cache_ptr = 0;
 }
 
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+   int i;
+   unsigned long e, v;
+
+   if (!slb_ptr)
+   return;
+
+   for (i = 0; i < mmu_slb_size; i++) {
+   asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
+   asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
+   slb_ptr->esid = e;
+   slb_ptr->vsid = v;
+   slb_ptr++;
+   }
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+   int i;
+   unsigned long e, v;
+   unsigned long llp;
+
+   if (!slb_ptr)
+   return;
+
+   pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+
+   for (i = 0; i < mmu_slb_size; i++) {
+   e = slb_ptr->esid;
+   v = slb_ptr->vsid;
+   slb_ptr++;
+
+   if (!e && !v)
+   continue;
+
+   pr_err("%02d %016lx %016lx\n", i, e, v);
+
+   if (!(e & SLB_ESID_V)) {
+   pr_err("\n");
+   continue;
+   }
+   llp = v & SLB_VSID_LLP;
+   if (v & SLB_VSID_B_1T) {
+   pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+   GET_ESID_1T(e),
+   (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
+   llp);
+   } else {
+   pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+   GET_ESID(e),
+   (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
+   llp);
+   }
+   }
+}
+
 void slb_vmalloc_update(void)
 {
unsigned long