Re: [PATCH 2/2] powernv:idle:Implement lite variant of power_enter_stop

2016-09-19 Thread Balbir Singh


On 16/09/16 19:47, Gautham R. Shenoy wrote:
> From: "Gautham R. Shenoy" 
> 
> This patch adds a function named power_enter_stop_lite() that can
> execute a stop instruction when ESL and EC bits are set to zero in the
> PSSCR.  The function handles the wake-up from idle at the instruction
> immediately after the stop instruction.
> 
> If the flag OPAL_PM_WAKEUP_AT_NEXT_INST[1] is set in the device tree
> for a stop state, then use the lite variant for that particular stop
> state.
> 
> [1] : The corresponding patch in skiboot that defines
>   OPAL_PM_WAKEUP_AT_NEXT_INST and enables it in the device tree
>   can be found here:
>   https://lists.ozlabs.org/pipermail/skiboot/2016-September/004805.html
> 
> Signed-off-by: Gautham R. Shenoy 
> ---
>  arch/powerpc/include/asm/opal-api.h   |  1 +
>  arch/powerpc/include/asm/processor.h  |  3 ++-
>  arch/powerpc/kernel/idle_book3s.S | 28 +---
>  arch/powerpc/platforms/powernv/idle.c | 17 ++---
>  arch/powerpc/platforms/powernv/smp.c  |  2 +-
>  drivers/cpuidle/cpuidle-powernv.c | 24 ++--
>  6 files changed, 65 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/opal-api.h 
> b/arch/powerpc/include/asm/opal-api.h
> index 0e2e57b..6e5741e 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -179,6 +179,7 @@
>  #define OPAL_PM_TIMEBASE_STOP        0x00000002
>  #define OPAL_PM_LOSE_HYP_CONTEXT     0x00002000
>  #define OPAL_PM_LOSE_FULL_CONTEXT    0x00004000
> +#define OPAL_PM_WAKEUP_AT_NEXT_INST  0x00008000
>  #define OPAL_PM_NAP_ENABLED          0x00000001
>  #define OPAL_PM_SLEEP_ENABLED        0x00000002
>  #define OPAL_PM_WINKLE_ENABLED       0x00000004
> diff --git a/arch/powerpc/include/asm/processor.h 
> b/arch/powerpc/include/asm/processor.h
> index 68e3bf5..e0549a0 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -460,7 +460,8 @@ extern int powersave_nap; /* set if nap mode can be used 
> in idle loop */
>  extern unsigned long power7_nap(int check_irq);
>  extern unsigned long power7_sleep(void);
>  extern unsigned long power7_winkle(void);
> -extern unsigned long power9_idle_stop(unsigned long stop_level);
> +extern unsigned long power9_idle_stop(unsigned long stop_level,
> + unsigned long exec_lite);
>  
>  extern void flush_instruction_cache(void);
>  extern void hard_reset_now(void);
> diff --git a/arch/powerpc/kernel/idle_book3s.S 
> b/arch/powerpc/kernel/idle_book3s.S
> index 32d666b..47ee106 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -43,6 +43,8 @@
>  #define PSSCR_HV_TEMPLATE    PSSCR_ESL | PSSCR_EC | \
>   PSSCR_PSLL_MASK | PSSCR_TR_MASK | \
>   PSSCR_MTL_MASK
> +#define PSSCR_HV_TEMPLATE_LITE   PSSCR_PSLL_MASK | PSSCR_TR_MASK | \
> +  PSSCR_MTL_MASK
>  
>   .text
>  
> @@ -246,6 +248,20 @@ enter_winkle:
>  
>   IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
>  
> +
> +/*
> + * power_enter_stop_lite : This will resume the wake up from
> + * idle at the subsequent instruction.
> + *
> + * Caller should set ESL=EC=0 in PSSCR before calling
> + * this function.
> + *
> + */
> +power_enter_stop_lite:
> + IDLE_STATE_ENTER_SEQ(PPC_STOP)
> +7:   li  r3,0  /* Since we didn't lose state, return 0 */
> + b   pnv_wakeup_noloss
> +
>  /*
>   * r3 - requested stop state
>   */
> @@ -333,13 +349,19 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 
> 66);\
>  
>  /*
>   * r3 - requested stop state
> + * r4 - Indicates if the lite variant with ESL=EC=0 should be executed.
>   */
>  _GLOBAL(power9_idle_stop)
> - LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE)
> - or  r4,r4,r3
> + cmpdi   r4, 1
> + bne 4f
> + LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE_LITE)
> + LOAD_REG_ADDR(r5,power_enter_stop_lite)
> + b   5f
> +4:   LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE)
> + LOAD_REG_ADDR(r5,power_enter_stop)
> +5:   or  r4,r4,r3
>   mtspr   SPRN_PSSCR, r4
>   li  r4, 1
> - LOAD_REG_ADDR(r5,power_enter_stop)
>   b   pnv_powersave_common
>   /* No return */
>  /*
> diff --git a/arch/powerpc/platforms/powernv/idle.c 
> b/arch/powerpc/platforms/powernv/idle.c
> index 479c256..c3d3fed 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -244,8 +244,15 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
>  static void power9_idle(void)
>  {
>   /* Requesting stop state 0 */
> - power9_idle_stop(0);
> + power9_idle_stop(0, 0);
>  }
> +
> +static void power9_idle_lite(void)
> +{
> + /* Requesting stop state 0 with ESL=EC=0 */
> + power9_idle_stop(0, 1);
> +}
> +
>  
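The cpuidle-powernv.c hunk listed in the diffstat is cut off above. Roughly, the
selection it implies boils down to the sketch below; only power9_idle_stop() and
OPAL_PM_WAKEUP_AT_NEXT_INST come from the patch, the flag/PSSCR array names are
placeholders:

	/*
	 * Sketch only: request the lite stop (ESL=EC=0, wake at the next
	 * instruction) when the device tree marked this stop state with
	 * OPAL_PM_WAKEUP_AT_NEXT_INST; otherwise request the full variant.
	 */
	unsigned long lite = !!(stop_flags[index] & OPAL_PM_WAKEUP_AT_NEXT_INST);

	srr1 = power9_idle_stop(stop_level[index], lite);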

Re: [PATCH] powerpc/64s: exception optimise MSR handling

2016-09-19 Thread Nicholas Piggin
On Tue, 20 Sep 2016 14:25:48 +1000
Michael Ellerman  wrote:

> Nicholas Piggin  writes:
> 
> > mtmsrd with L=1 only affects MSR_EE and MSR_RI bits, and we always
> > know what state those bits are, so the kernel MSR does not need to be
> > loaded when modifying them.
> >
> > mtmsrd is often in the critical execution path, so avoiding a dependency
> > on even an L1 load is noticeable. On a POWER8 this saves about 3 cycles
> > from the syscall path, and possibly a few from other exception returns
> > (not measured).  
> 
> This looks good in principle.
> 
> My worry is that these code paths have lots of assumptions about what's
> left in registers, so we may have a path somewhere which expects rX to
> contain PACAKMSR. Hence the review below ...
> 
> > diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> > index 6b8bc0d..585b9ca 100644
> > --- a/arch/powerpc/kernel/entry_64.S
> > +++ b/arch/powerpc/kernel/entry_64.S
> > @@ -139,7 +139,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> >  #ifdef CONFIG_PPC_BOOK3E
> > wrteei  1
> >  #else
> > -   ld  r11,PACAKMSR(r13)
> > +   li  r11,MSR_RI
> > ori r11,r11,MSR_EE
> > mtmsrd  r11,1
> >  #endif /* CONFIG_PPC_BOOK3E */  
> 
>   /* We do need to set SOFTE in the stack frame or the return
>* from interrupt will be painful
>*/
>   li  r10,1
>   std r10,SOFTE(r1)
> 
>   CURRENT_THREAD_INFO(r11, r1)
> 
> So that's one OK. r11 isn't used again until it's clobbered here.
> 
> 
> > @@ -195,7 +195,6 @@ system_call:/* label this so stack 
> > traces look sane */  
> 
> #ifdef CONFIG_PPC_BOOK3S
>   /* No MSR:RI on BookE */
>   andi.   r10,r8,MSR_RI
>   beq-unrecov_restore
> #endif
> 
> So at this point r10 == MSR_RI, otherwise we would have taken the branch.
> 
>   /*
>* Disable interrupts so current_thread_info()->flags can't change,
>* and so that we don't get interrupted after loading SRR0/1.
>*/
> >  #ifdef CONFIG_PPC_BOOK3E
> > wrteei  0
> >  #else
> > -   ld  r10,PACAKMSR(r13)
> > /*
> >  * For performance reasons we clear RI the same time that we
> >  * clear EE. We only need to clear RI just before we restore r13
> >  * below, but batching it with EE saves us one expensive mtmsrd call.
> >  * We have to be careful to restore RI if we branch anywhere from
> >  * here (eg syscall_exit_work).
> >  */
> > -   li  r9,MSR_RI
> > -   andcr11,r10,r9
> > +   li  r11,0
> > mtmsrd  r11,1
> >  #endif /* CONFIG_PPC_BOOK3E */  
> 
>   ld  r9,TI_FLAGS(r12)
>   li  r11,-MAX_ERRNO
>   andi.   
> r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
>   bne-syscall_exit_work
> 
> Which is:
> 
> syscall_exit_work:
> #ifdef CONFIG_PPC_BOOK3S
>   mtmsrd  r10,1   /* Restore RI */
> #endif
> 
> So that appears to still work. But it's super fragile.

Agreed. We'll go with the idea you mentioned offline to just load r10 again
here to avoid the long dependency -- it's not going to be a measurable cost.
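Presumably something along these lines (sketch only, not necessarily what gets
committed):

syscall_exit_work:
#ifdef CONFIG_PPC_BOOK3S
	li	r10,MSR_RI
	mtmsrd	r10,1		/* Restore RI; a constant avoids the PACAKMSR load */
#endif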


> What I'd like to do is drop that optimisation of clearing RI early with
> EE. That would avoid us needing to restore RI in syscall_exit_work and
> before restore_math (and reclearing it after).
> 
> But I guess that will hurt performance too much :/

Yes, that's something to look into. Newer cores, more kernel code using fp
registers. I'll look into it.

Thanks,
Nick



Re: [PATCH v2 2/3] powerpc: get hugetlbpage handling more generic

2016-09-19 Thread Christophe Leroy



On 20/09/2016 at 04:28, Aneesh Kumar K.V wrote:

christophe leroy  writes:


On 19/09/2016 at 07:50, Aneesh Kumar K.V wrote:


Christophe Leroy  writes:

+#else
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+   BUG();
+}
+
 #endif



I was expecting that BUG will get removed in the next patch. But I don't
see it in the next patch. Considering

@@ -475,11 +453,10 @@ static void free_hugepd_range(struct mmu_gather *tlb, 
hugepd_t *hpdp, int pdshif
for (i = 0; i < num_hugepd; i++, hpdp++)
hpdp->pd = 0;

-#ifdef CONFIG_PPC_FSL_BOOK3E
-   hugepd_free(tlb, hugepte);
-#else
-   pgtable_free_tlb(tlb, hugepte, pdshift - shift);
-#endif
+   if (shift >= pdshift)
+   hugepd_free(tlb, hugepte);
+   else
+   pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }

What is that I am missing ?



Previously, the call to hugepd_free() was compiled only under #ifdef
CONFIG_PPC_FSL_BOOK3E.
Now it is compiled at all times, but it should never be called when
CONFIG_PPC_FSL_BOOK3E is not set, because we always have shift < pdshift in
that case. So the function needs to be defined anyway but should never be
called. Should I just define it as an empty static inline?
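A minimal sketch of that empty-stub option, assuming the non-FSL path really is
unreachable:

static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	/* assumed unreachable: shift < pdshift on these configs */
}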



For 8M with 4K mode, we have shift >= pdshift right ?



Yes, that's the reason: that way, in the following patch, we get
a real hugepd_free() also for the 8xx.


@@ -366,7 +373,7 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 }
 #endif

-#ifdef CONFIG_PPC_FSL_BOOK3E
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 #define HUGEPD_FREELIST_SIZE \
((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))



Christophe


[PATCH v2 6/6] powerpc/boot: Add support for XZ compression

2016-09-19 Thread Oliver O'Halloran
This patch adds an option to use XZ compression for the kernel image.
Currently this is only enabled for PPC64 targets since the bulk of the
32bit platforms produce uboot images which do not use the wrapper.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/boot/Makefile |  3 +++
 arch/powerpc/boot/decompress.c |  5 +
 arch/powerpc/boot/types.h  | 10 +
 arch/powerpc/boot/xz_config.h  | 39 ++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 5 files changed, 58 insertions(+)
 create mode 100644 arch/powerpc/boot/xz_config.h
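For reference, exercising this once the series is applied is just a matter of
switching the compression choice and rebuilding (illustrative; option names as
in Kconfig's "Kernel compression mode" choice):

  # .config fragment
  # CONFIG_KERNEL_GZIP is not set
  CONFIG_KERNEL_XZ=y

  # then rebuild the wrapped image as usual
  make ARCH=powerpc zImage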

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9fb451d0586e..eae2dc8bc218 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,6 +20,7 @@
 all: $(obj)/zImage
 
 compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+compress-$(CONFIG_KERNEL_XZ)   := CONFIG_KERNEL_XZ
 
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
@@ -226,6 +227,7 @@ endif
 endif
 
 compressor-$(CONFIG_KERNEL_GZIP) := gz
+compressor-$(CONFIG_KERNEL_XZ)   := xz
 
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
@@ -433,6 +435,7 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* 
treeImage.* \
 # clean up files cached by wrapper
 clean-kernel-base := vmlinux.strip vmlinux.bin
 clean-kernel := $(addsuffix .gz,$(clean-kernel-base))
+clean-kernel += $(addsuffix .xz,$(clean-kernel-base))
 # If not absolute clean-files are relative to $(obj).
 clean-files += $(addprefix $(objtree)/, $(clean-kernel))
 
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
index 60fc6fb26867..8f32ea4289af 100644
--- a/arch/powerpc/boot/decompress.c
+++ b/arch/powerpc/boot/decompress.c
@@ -37,6 +37,11 @@
 #  include "decompress_inflate.c"
 #endif
 
+#ifdef CONFIG_KERNEL_XZ
+#  include "xz_config.h"
+#  include "../../../lib/decompress_unxz.c"
+#endif
+
 /* globals for tracking the state of the decompression */
 static unsigned long decompressed_bytes;
 static unsigned long limit;
diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h
index 85565a89bcc2..0362a262a299 100644
--- a/arch/powerpc/boot/types.h
+++ b/arch/powerpc/boot/types.h
@@ -34,4 +34,14 @@ typedef s64 int64_t;
(void) (&_x == &_y);\
_x > _y ? _x : _y; })
 
+#define min_t(type, a, b) min(((type) a), ((type) b))
+#define max_t(type, a, b) max(((type) a), ((type) b))
+
+#ifndef true
+#define true 1
+#endif
+
+#ifndef false
+#define false 0
+#endif
 #endif /* _TYPES_H_ */
diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h
new file mode 100644
index ..5c6afdbca642
--- /dev/null
+++ b/arch/powerpc/boot/xz_config.h
@@ -0,0 +1,39 @@
+#ifndef __XZ_CONFIG_H__
+#define __XZ_CONFIG_H__
+
+/*
+ * most of this is copied from lib/xz/xz_private.h, we can't use their defines
+ * since the boot wrapper is not built in the same environment as the rest of
+ * the kernel.
+ */
+
+#include "types.h"
+#include "swab.h"
+
+static inline uint32_t swab32p(void *p)
+{
+   uint32_t *q = p;
+
+   return swab32(*q);
+}
+
+#ifdef __LITTLE_ENDIAN__
+#define get_le32(p) (*((uint32_t *) (p)))
+#else
+#define get_le32(p) swab32p(p)
+#endif
+
+#define memeq(a, b, size) (memcmp(a, b, size) == 0)
+#define memzero(buf, size) memset(buf, 0, size)
+
+/* prevent the inclusion of the xz-preboot MM headers */
+#define DECOMPR_MM_H
+#define memmove memmove
+#define XZ_EXTERN static
+
+/* xz.h needs to be included directly since we need enum xz_mode */
+#include "../../../include/linux/xz.h"
+
+#undef XZ_EXTERN
+
+#endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index f32edec13fd1..d5da55b01027 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -2,6 +2,7 @@ config PPC64
bool "64-bit kernel"
default n
select ZLIB_DEFLATE
+   select HAVE_KERNEL_XZ
help
  This option selects whether a 32-bit or a 64-bit kernel
  will be built.
-- 
2.5.5



[PATCH v2 5/6] powerpc/boot: add xz support to the wrapper script

2016-09-19 Thread Oliver O'Halloran
This modifies the script so that the -Z option takes an argument to
specify the compression type. It can either be 'gz', 'xz' or 'none'.
The legacy --no-gzip and -z options are still supported and will set
the compression to none and gzip respectively, but they are not
documented.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/boot/Makefile |  7 --
 arch/powerpc/boot/wrapper  | 61 ++
 2 files changed, 50 insertions(+), 18 deletions(-)
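For reference, the resulting invocations look like this (output and platform
names below are only examples):

  ./arch/powerpc/boot/wrapper -Z xz   -o zImage.pseries -p pseries vmlinux
  ./arch/powerpc/boot/wrapper -Z none -o zImage.ps3     -p ps3     vmlinux
  ./arch/powerpc/boot/wrapper -z      -o zImage.pseries -p pseries vmlinux   # legacy gzip spelling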

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 861348c72519..9fb451d0586e 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -225,10 +225,13 @@ CROSSWRAP := -C "$(CROSS_COMPILE)"
 endif
 endif
 
+compressor-$(CONFIG_KERNEL_GZIP) := gz
+
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
-  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \
-   $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux
+  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \
+   $(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \
+   vmlinux
 
 image-$(CONFIG_PPC_PSERIES)+= zImage.pseries
 image-$(CONFIG_PPC_POWERNV)+= zImage.pseries
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 6681ec3625c9..cf7631be5007 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -20,6 +20,8 @@
 # -D dir   specify directory containing data files used by script
 #  (default ./arch/powerpc/boot)
 # -W dir   specify working directory for temporary files (default .)
+# -z   use gzip (legacy)
+# -Z zsuffixcompression to use (gz, xz or none)
 
 # Stop execution if any command fails
 set -e
@@ -38,7 +40,7 @@ dtb=
 dts=
 cacheit=
 binary=
-gzip=.gz
+compression=.gz
 pie=
 format=
 
@@ -59,7 +61,8 @@ tmpdir=.
 usage() {
 echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
 echo '   [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-echo '   [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2
+echo '   [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
+echo '   [--no-compression] [vmlinux]' >&2
 exit 1
 }
 
@@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do
[ "$#" -gt 0 ] || usage
tmpdir="$1"
;;
+-z)
+   compression=.gz
+   ;;
+-Z)
+   shift
+   [ "$#" -gt 0 ] || usage
+[ "$1" != "gz" -o "$1" != "xz" -o "$1" != "none" ] || usage
+
+   compression=".$1"
+
+if [ $compression = ".none" ]; then
+compression=
+fi
+   ;;
 --no-gzip)
-gzip=
+# a "feature" of the wrapper script is that it can be used outside
+# the kernel tree. So keeping this around for backwards compatibility.
+compression=
 ;;
 -?)
usage
@@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do
 shift
 done
 
+
 if [ -n "$dts" ]; then
 if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then
dts="$object/dts/$dts"
@@ -212,7 +232,7 @@ miboot|uboot*)
 ;;
 cuboot*)
 binary=y
-gzip=
+compression=
 case "$platform" in
 *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc)
 platformo=$object/cuboot-8xx.o
@@ -243,7 +263,7 @@ cuboot*)
 ps3)
 platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o"
 lds=$object/zImage.ps3.lds
-gzip=
+compression=
 ext=bin
 objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data"
 ksection=.kernel:vmlinux.bin
@@ -310,27 +330,37 @@ mvme7100)
 esac
 
 vmz="$tmpdir/`basename \"$kernel\"`.$ext"
-if [ -z "$cacheit" -o ! -f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then
-${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
 
-strip_size=$(stat -c %s $vmz.$$)
+# Calculate the vmlinux.strip size
+${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
+strip_size=$(stat -c %s $vmz.$$)
 
-if [ -n "$gzip" ]; then
+if [ -z "$cacheit" -o ! -f "$vmz$compression" -o "$vmz$compression" -ot 
"$kernel" ]; then
+# recompress the image if we need to
+case $compression in
+.xz)
+xz --check=crc32 -f -9 "$vmz.$$"
+;;
+.gz)
 gzip -n -f -9 "$vmz.$$"
-fi
+;;
+*)
+# drop the compression suffix so the stripped vmlinux is used
+compression=
+   ;;
+esac
 
 if [ -n "$cacheit" ]; then
-   mv -f "$vmz.$$$gzip" "$vmz$gzip"
+   mv -f "$vmz.$$$compression" "$vmz$compression"
 else
vmz="$vmz.$$"
 fi
 else
-# Calculate the vmlinux.strip size
-${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
-strip_size=$(stat -c %s $vmz.$$)
 rm -f $vmz.$$
 fi
 
+vmz="$vmz$compression"
+
 if [ "$make_space" = "y" ]; then
# Round the size to next higher MB limit
round_size=$(((strip_size + 0xf) & 0xfff0))
@@ 

[PATCH v2 4/6] powerpc/boot: remove legacy gzip wrapper

2016-09-19 Thread Oliver O'Halloran
This code is no longer used and can be removed.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/boot/cuboot-c2k.c  |   1 -
 arch/powerpc/boot/gunzip_util.c | 204 
 arch/powerpc/boot/gunzip_util.h |  45 -
 3 files changed, 250 deletions(-)
 delete mode 100644 arch/powerpc/boot/gunzip_util.c
 delete mode 100644 arch/powerpc/boot/gunzip_util.h

diff --git a/arch/powerpc/boot/cuboot-c2k.c b/arch/powerpc/boot/cuboot-c2k.c
index e43594950ba3..9309c51f1d65 100644
--- a/arch/powerpc/boot/cuboot-c2k.c
+++ b/arch/powerpc/boot/cuboot-c2k.c
@@ -18,7 +18,6 @@
 #include "io.h"
 #include "ops.h"
 #include "elf.h"
-#include "gunzip_util.h"
 #include "mv64x60.h"
 #include "cuboot.h"
 #include "ppcboot.h"
diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c
deleted file mode 100644
index 9dc52501de83..
--- a/arch/powerpc/boot/gunzip_util.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright 2007 David Gibson, IBM Corporation.
- * Based on earlier work, Copyright (C) Paul Mackerras 1997.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include 
-#include "string.h"
-#include "stdio.h"
-#include "ops.h"
-#include "gunzip_util.h"
-
-#define HEAD_CRC   2
-#define EXTRA_FIELD    4
-#define ORIG_NAME      8
-#define COMMENT        0x10
-#define RESERVED   0xe0
-
-/**
- * gunzip_start - prepare to decompress gzip data
- * @state: decompressor state structure to be initialized
- * @src:   buffer containing gzip compressed or uncompressed data
- * @srclen:size in bytes of the buffer at src
- *
- * If the buffer at @src contains a gzip header, this function
- * initializes zlib to decompress the data, storing the decompression
- * state in @state.  The other functions in this file can then be used
- * to decompress data from the gzipped stream.
- *
- * If the buffer at @src does not contain a gzip header, it is assumed
- * to contain uncompressed data.  The buffer information is recorded
- * in @state and the other functions in this file will simply copy
- * data from the uncompressed data stream at @src.
- *
- * Any errors, such as bad compressed data, cause an error to be
- * printed an the platform's exit() function to be called.
- */
-void gunzip_start(struct gunzip_state *state, void *src, int srclen)
-{
-   char *hdr = src;
-   int hdrlen = 0;
-
-   memset(state, 0, sizeof(*state));
-
-   /* Check for gzip magic number */
-   if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) {
-   /* gzip data, initialize zlib parameters */
-   int r, flags;
-
-   state->s.workspace = state->scratch;
-   if (zlib_inflate_workspacesize() > sizeof(state->scratch))
-   fatal("insufficient scratch space for gunzip\n\r");
-
-   /* skip header */
-   hdrlen = 10;
-   flags = hdr[3];
-   if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0)
-   fatal("bad gzipped data\n\r");
-   if ((flags & EXTRA_FIELD) != 0)
-   hdrlen = 12 + hdr[10] + (hdr[11] << 8);
-   if ((flags & ORIG_NAME) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & COMMENT) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & HEAD_CRC) != 0)
-   hdrlen += 2;
-   if (hdrlen >= srclen)
-   fatal("gunzip_start: ran out of data in header\n\r");
-
-   r = zlib_inflateInit2(>s, -MAX_WBITS);
-   if (r != Z_OK)
-   fatal("inflateInit2 returned %d\n\r", r);
-   }
-
-   state->s.total_in = hdrlen;
-   state->s.next_in = src + hdrlen;
-   state->s.avail_in = srclen - hdrlen;
-}
-
-/**
- * gunzip_partial - extract bytes from a gzip data stream
- * @state: gzip state structure previously initialized by gunzip_start()
- * @dst:   buffer to store extracted data
- * @dstlen:maximum number of bytes to extract
- *
- * This function extracts at most @dstlen bytes from the data stream
- * previously associated with @state by gunzip_start(), decompressing
- * if necessary.  Exactly @dstlen bytes are extracted unless the data
- * stream doesn't contain enough bytes, in which case the entire
- * remainder of the stream is decompressed.
- *
- * Returns the actual number of bytes extracted.  If any errors occur,
- * such as a corrupted compressed stream, an error is printed an the
- * platform's exit() function is called.
- */
-int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen)
-{
-   int len;
-

[PATCH v2 3/6] powerpc/boot: use the preboot decompression API

2016-09-19 Thread Oliver O'Halloran
Currently the powerpc boot wrapper has its own wrapper around zlib to
handle decompressing gzipped kernels. The kernel decompressor library
functions now provide a generic interface that can be used in the pre-boot
environment. This allows boot wrappers to easily support different
compression algorithms. This patch converts the wrapper to use this new
API, but does not add support for using new algorithms.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/boot/Makefile |  34 +++---
 arch/powerpc/boot/decompress.c | 142 +
 arch/powerpc/boot/main.c   |  35 +-
 arch/powerpc/boot/ops.h|   3 +
 4 files changed, 189 insertions(+), 25 deletions(-)
 create mode 100644 arch/powerpc/boot/decompress.c

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index bede555d78cf..861348c72519 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -63,13 +63,28 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
-# the kernel's version of zlib pulls in a lot of other kernel headers
-# which we don't provide inside the wrapper.
+# The pre-boot decompressors pull in a lot of kernel headers and other source
+# files. This creates a bit of a dependency headache since we need to copy
+# these files into the build dir, fix up any includes and ensure that dependent
+# files are copied in the right order.
+
+# these need to be separate variables because they are copied out of different
+# directories in the kernel tree. Sure you could merge them, but it's a
+# cure-is-worse-than-disease situation.
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c
 zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
 zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
 zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+$(addprefix $(obj)/, decompress.o): \
+   $(addprefix $(obj)/,$(zlib-decomp-y))
+
+$(addprefix $(obj)/, $(zlib-decomp-y)): \
+   $(addprefix $(obj)/,$(zliblinuxheader-y)) \
+   $(addprefix $(obj)/,$(zlibheader-y)) \
+   $(addprefix $(obj)/,$(zlib-y))
+
+$(addprefix $(obj)/,$(zlib-y)): \
$(addprefix $(obj)/,$(zliblinuxheader-y)) \
$(addprefix $(obj)/,$(zlibheader-y))
 
@@ -79,10 +94,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h
 $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
$(addprefix $(obj)/,$(libfdtheader))
 
-src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
+src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
+   elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -143,6 +158,9 @@ $(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: 
$(srctree)/lib/zlib_inflate/%
 $(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
+$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/%
+   $(call cmd,copy_kern_src)
+
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
 
@@ -160,7 +178,7 @@ $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: 
$(srctree)/$(src)/%.S
$(Q)cp $< $@
 
 clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
-   $(libfdt) $(libfdtheader) \
+   $(zlib-decomp-) $(libfdt) $(libfdtheader) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
 quiet_cmd_bootcc = BOOTCC  $@
@@ -410,8 +428,8 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* 
treeImage.* \
zImage.maple simpleImage.* otheros.bld *.dtb
 
 # clean up files cached by wrapper
-clean-kernel := vmlinux.strip vmlinux.bin
-clean-kernel += $(addsuffix .gz,$(clean-kernel))
+clean-kernel-base := vmlinux.strip vmlinux.bin
+clean-kernel := $(addsuffix .gz,$(clean-kernel-base))
 # If not absolute clean-files are relative to $(obj).
 clean-files += $(addprefix $(objtree)/, $(clean-kernel))
 
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
new file mode 100644
index ..60fc6fb26867
--- /dev/null
+++ b/arch/powerpc/boot/decompress.c
@@ -0,0 +1,142 @@
+/*
+ * Wrapper around the kernel's pre-boot decompression library.
+ *
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; 

[PATCH v2 2/6] powerpc/boot: Use CONFIG_KERNEL_GZIP

2016-09-19 Thread Oliver O'Halloran
Most architectures allow the compression algorithm used to produce the
vmlinuz image to be selected as a kernel config option. In preparation
for supporting algorithms other than gzip in the powerpc boot wrapper,
the makefile needs to be modified to use these config options.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/boot/Makefile | 30 ++
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 914983a29156..aa96bda118aa 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -161,6 +161,7 @@ config PPC
select GENERIC_CPU_AUTOPROBE
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_ARCH_HARDENED_USERCOPY
+   select HAVE_KERNEL_GZIP
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 7d6768253caa..bede555d78cf 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -19,10 +19,14 @@
 
 all: $(obj)/zImage
 
+compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
--isystem $(shell $(CROSS32CC) -print-file-name=include)
+-isystem $(shell $(CROSS32CC) -print-file-name=include) \
+-D$(compress-y)
+
 ifdef CONFIG_PPC64_BOOT_WRAPPER
 BOOTCFLAGS += -m64
 endif
@@ -59,13 +63,15 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
+# the kernel's version of zlib pulls in a lot of other kernel headers
+# which we don't provide inside the wrapper.
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
+zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
+zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-zlib   := inffast.c inflate.c inftrees.c
-zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h
-zliblinuxheader := zlib.h zconf.h zutil.h
-
-$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \
-   $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix 
$(obj)/,$(zlibheader))
+$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+   $(addprefix $(obj)/,$(zliblinuxheader-y)) \
+   $(addprefix $(obj)/,$(zlibheader-y))
 
 libfdt   := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
@@ -76,7 +82,7 @@ $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o 
epapr.o opal.o): \
 src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
+   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -128,13 +134,13 @@ obj-plat: $(libfdt)
 quiet_cmd_copy_kern_src = COPY$@
   cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
-$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
+$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
@@ -153,7 +159,7 @@ $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S
 $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S
$(Q)cp $< $@
 
-clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \
+clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
$(libfdt) $(libfdtheader) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
-- 
2.5.5



[PATCH v2 1/6] powerpc/boot: add sed script

2016-09-19 Thread Oliver O'Halloran
The powerpc boot wrapper is compiled with a separate "bootcc" toolchain
rather than the toolchain used for the rest of the kernel. The main
problem with this is that the wrapper does not have access to the kernel
headers (without a lot of gross hacks). To get around this, the required
headers are copied into the build directory via several sed scripts
which rewrite problematic includes. This patch moves these fixups out of
the makefile into a separate .sed script file to clean up the makefile
slightly.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/boot/Makefile  | 16 +---
 arch/powerpc/boot/fixup-headers.sed | 12 
 2 files changed, 17 insertions(+), 11 deletions(-)
 create mode 100644 arch/powerpc/boot/fixup-headers.sed
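As a concrete (hypothetical) example of what the fixups do to a copied source
line:

  /* before, in the kernel tree */
  #include <linux/string.h>
  static void __used helper(void);

  /* after being piped through fixup-headers.sed */
  #include "string.h"
  static void  helper(void);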

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index df0fd406aed1..7d6768253caa 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -125,23 +125,17 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix 
$(obj)/, $(src-wlib
 obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat
 obj-plat: $(libfdt)
 
-quiet_cmd_copy_zlib = COPY$@
-  cmd_copy_zlib = sed "s@__used@@;s@]*\).*@\"\1\"@" $< > $@
-
-quiet_cmd_copy_zlibheader = COPY$@
-  cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@
-# stddef.h for NULL
-quiet_cmd_copy_zliblinuxheader = COPY$@
-  cmd_copy_zliblinuxheader = sed 
"s@@\"string.h\"@;s@@@;s@]*\).*@\"\1\"@"
 $< > $@
+quiet_cmd_copy_kern_src = COPY$@
+  cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
 $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlib)
+   $(call cmd,copy_kern_src)
 
 $(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlibheader)
+   $(call cmd,copy_kern_src)
 
 $(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
-   $(call cmd,copy_zliblinuxheader)
+   $(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
diff --git a/arch/powerpc/boot/fixup-headers.sed 
b/arch/powerpc/boot/fixup-headers.sed
new file mode 100644
index ..96362428eb37
--- /dev/null
+++ b/arch/powerpc/boot/fixup-headers.sed
@@ -0,0 +1,12 @@
+# Copyright 2016 IBM Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 or later as
+# published by the Free Software Foundation.
+
+s@#include <linux/decompress/mm\.h>@@;
+s@\"zlib_inflate/\([^\"]*\).*@"\1"@;
+s@<linux/kernel\.h>@<stddef.h>@;
+
+s@__used@@;
+s@<linux/\([^>]*\).*@"\1"@;
-- 
2.5.5



[v2] XZ compressed zImage support

2016-09-19 Thread Oliver O'Halloran
This series adds support for using XZ compression in addition to gzip in the
kernel boot wrapper. Currently this is only enabled for 64bit Book3S processors
since it seems that some embedded platforms rely on uBoot (or similar) to
decompress the image rather than having the kernel decompress itself. Enabling
it for other platforms should be fairly straight forward though.

Supporting other compression algorithms (like ARM and x86 do) is possible, but
painful. Each algorithm includes some kernel headers even when the #defines
that are supposed to make them usable in a pre-boot environment are set.
Including kernel headers is an issue because on powerpc the boot wrapper is
compiled with a different toolchain, and possibly for a different target, for
backwards compatibility reasons*. This makes it difficult to include kernel
headers since the include paths, etc. are not set up for BOOTCC.

This can be worked around by rewriting parts of each decompressor with sed
scripts, but the rewriting required is specific to each decompressor.

-oliver

*powermacs have 32bit firmware that cannot directly load a 64bit kernel. A 64
bit big endian kernel has a 32bit wrapper to work around this. On 64bit little
endian we don't have this legacy problem so the wrapper is also 64bit little
endian, but the toolchain issues are still there.

---
Changes from v1:
Fixed some missing dependencies in the Makefile that were causing random
build breaks.

Fixed "make clean" so that it would remove the files copied into
arch/powerpc/boot/ when the wrapper was built.

Previously this series renamed "zlibheader" to "zlibheaders". There were
consequences.
---


Re: [PATCH] powerpc/64s: exception optimise MSR handling

2016-09-19 Thread Michael Ellerman
Nicholas Piggin  writes:

> mtmsrd with L=1 only affects MSR_EE and MSR_RI bits, and we always
> know what state those bits are, so the kernel MSR does not need to be
> loaded when modifying them.
>
> mtmsrd is often in the critical execution path, so avoiding a dependency
> on even an L1 load is noticeable. On a POWER8 this saves about 3 cycles
> from the syscall path, and possibly a few from other exception returns
> (not measured).

This looks good in principle.

My worry is that these code paths have lots of assumptions about what's
left in registers, so we may have a path somewhere which expects rX to
contain PACAKMSR. Hence the review below ...

> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index 6b8bc0d..585b9ca 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -139,7 +139,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
>  #ifdef CONFIG_PPC_BOOK3E
>   wrteei  1
>  #else
> - ld  r11,PACAKMSR(r13)
> + li  r11,MSR_RI
>   ori r11,r11,MSR_EE
>   mtmsrd  r11,1
>  #endif /* CONFIG_PPC_BOOK3E */

/* We do need to set SOFTE in the stack frame or the return
 * from interrupt will be painful
 */
li  r10,1
std r10,SOFTE(r1)

CURRENT_THREAD_INFO(r11, r1)

So that's one OK. r11 isn't used again until it's clobbered here.


> @@ -195,7 +195,6 @@ system_call:  /* label this so stack 
> traces look sane */

#ifdef CONFIG_PPC_BOOK3S
/* No MSR:RI on BookE */
andi.   r10,r8,MSR_RI
beq-unrecov_restore
#endif

So at this point r10 == MSR_RI, otherwise we would have taken the branch.

/*
 * Disable interrupts so current_thread_info()->flags can't change,
 * and so that we don't get interrupted after loading SRR0/1.
 */
>  #ifdef CONFIG_PPC_BOOK3E
>   wrteei  0
>  #else
> - ld  r10,PACAKMSR(r13)
>   /*
>* For performance reasons we clear RI the same time that we
>* clear EE. We only need to clear RI just before we restore r13
>* below, but batching it with EE saves us one expensive mtmsrd call.
>* We have to be careful to restore RI if we branch anywhere from
>* here (eg syscall_exit_work).
>*/
> - li  r9,MSR_RI
> - andcr11,r10,r9
> + li  r11,0
>   mtmsrd  r11,1
>  #endif /* CONFIG_PPC_BOOK3E */

ld  r9,TI_FLAGS(r12)
li  r11,-MAX_ERRNO
andi.   
r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
bne-syscall_exit_work

Which is:

syscall_exit_work:
#ifdef CONFIG_PPC_BOOK3S
mtmsrd  r10,1   /* Restore RI */
#endif

So that appears to still work. But it's super fragile.

What I'd like to do is drop that optimisation of clearing RI early with
EE. That would avoid us needing to restore RI in syscall_exit_work and
before restore_math (and reclearing it after).

But I guess that will hurt performance too much :/

cheers


Re: [PATCH] powerpc/64s: optimise syscall entry for virtual, relocatable case

2016-09-19 Thread Balbir Singh


On 15/09/16 19:03, Nicholas Piggin wrote:
> The mflr r10 instruction was left over from saving lr when the code used
> lr to branch to system_call_entry from the exception handler. That was
> changed by 6a404806d to use the count register. The value is never used
> now, so mflr can be removed, and r10 can be used for storage rather than
> spilling to the SPR scratch register.
> 
> The scratch register spill causes a long pipeline stall due to the SPR
> read after write. This change brings getppid syscall cost from 406 to
> 376 cycles on POWER8. getppid for non-relocatable case is 371 cycles.
> 
> Signed-off-by: Nicholas Piggin 
> ---
> 
>  arch/powerpc/kernel/exceptions-64s.S | 7 ++-
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> b/arch/powerpc/kernel/exceptions-64s.S
> index df6d45e..2cdd64f 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -63,15 +63,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
> \
>* is volatile across system calls.
>*/
>  #define SYSCALL_PSERIES_2_DIRECT \
> - mflrr10 ;   \
>   ld  r12,PACAKBASE(r13) ;\
>   LOAD_HANDLER(r12, system_call_entry) ;  \
>   mtctr   r12 ;   \
>   mfspr   r12,SPRN_SRR1 ; \
> - /* Re-use of r13... No spare regs to do this */ \
> - li  r13,MSR_RI ;\
> - mtmsrd  r13,1 ; \
> - GET_PACA(r13) ; /* get r13 back */  \
> + li  r10,MSR_RI ;\
> + mtmsrd  r10,1 ; \
>   bctr ;
>  #else
>   /* We can branch directly */
> 

The patch makes sense

Acked-by: Balbir Singh 


Re: PowerPC agpmode issues

2016-09-19 Thread Herminio Hernandez, Jr.
Michel,

Yes to both; however, when I set radeon.agpmode=1 the kernel freezes most of
the time when booting. When I do get past that, I get these errors:

rican-linux@Debian-G5:~$ dmesg |grep -e radeon -e drm
[0.00] Kernel command line: root=UUID=aeca9a67-31d7-4c4b-a0f8-4db328b33305 radeon.agpmode=1
[   10.432049] [drm] Initialized drm 1.1.0 20060810
[   11.291427] [drm] radeon kernel modesetting enabled.
[   11.302838] fb: switching to radeondrmfb from OFfb ATY,Simone
[   11.317952] fb: switching to radeondrmfb from OFfb ATY,Simone
[   11.321318] radeon :f0:10.0: enabling device (0006 -> 0007)
[   11.321741] [drm] initializing kernel modesetting (RV350 0x1002:0x4150
0x1002:0x4150 0x00).
[   11.321770] [drm] register mmio base: 0xA000
[   11.321776] [drm] register mmio size: 65536
[   11.321820] radeon :f0:10.0: Invalid PCI ROM header signature:
expecting 0xaa55, got 0x
[   11.415530] [drm] Not an x86 BIOS ROM, not using.
[   11.415573] [drm] Using device-tree clock info
[   11.415769] [drm:.radeon_agp_init [radeon]] *ERROR* Illegal AGP Mode: 1 (valid 4, 8), leaving at 8
[   11.415818] radeon :f0:10.0: putting AGP V3 device into 8x mode
[   11.415925] radeon :f0:10.0: GTT: 256M 0x - 0x0FFF
[   11.415933] [drm] Generation 2 PCI interface, using max accessible memory
[   11.415943] radeon :f0:10.0: VRAM: 256M 0xB000 -
0xBFFF (64M used)
[   11.415987] [drm] Detected VRAM RAM=256M, BAR=256M
[   11.415993] [drm] RAM width 128bits DDR
[   11.416229] [drm] radeon: 64M of VRAM memory ready
[   11.416237] [drm] radeon: 256M of GTT memory ready.
[   11.416308] [drm] radeon: 1 quad pipes, 1 Z pipes initialized.
[   11.431606] radeon :f0:10.0: WB disabled
[   11.431627] radeon :f0:10.0: fence driver on ring 0 use gpu addr
0x and cpu addr 0xd3ee
[   11.431642] [drm] Supports vblank timestamp caching Rev 2 (21.10.2013).
[   11.431648] [drm] Driver supports precise vblank timestamp query.
[   11.431700] [drm] radeon: irq initialized.
[   11.431743] [drm] Loading R300 Microcode
[   11.488921] radeon :f0:10.0: firmware: direct-loading firmware
radeon/R300_cp.bin
[   11.489178] [drm] radeon: ring at 0x0001





[   11.637210] [drm:.r100_ring_test [radeon]] *ERROR* radeon: ring test failed (scratch(0x15E4)=0xCAFEDEAD)
[   11.637318] [drm:.r100_cp_init [radeon]] *ERROR* radeon: cp isn't working (-22).
[   11.637331] radeon :f0:10.0: failed initializing CP (-22).
[   11.637338] radeon :f0:10.0: Disabling GPU acceleration
[   11.784325] [drm:.r100_cp_fini [radeon]] *ERROR* Wait for CP idle timeout, shutting down CP.
[   11.931351] [drm] radeon: cp finalized



[   11.931464] radeon :f0:10.0: (r300_asic_reset:425) RBBM_STATUS=0x80010140
[   12.431463] radeon :f0:10.0: (r300_asic_reset:444) RBBM_STATUS=0x80010140
[   12.927471] radeon :f0:10.0: (r300_asic_reset:456) RBBM_STATUS=0x0140
[   12.927510] radeon :f0:10.0: GPU reset succeed



On Mon, Sep 19, 2016 at 8:05 PM, Michel Dänzer  wrote:

> On 19/09/16 09:36 PM, Mathieu Malaterre wrote:
> >
> > Finally your dmesg looks odd since the line `[drm] Forcing AGP to PCI
> > mode` comes only after the first error.
>
> That isn't odd but the AGP->PCI(e) fallback mechanism working as
> intended, trying AGP first and falling back to PCIe if AGP fails.
>
> Herminio, does the problem also occur if you specify radeon.agpmode=-1
> (or maybe =1) on the kernel command line?
>
>
> --
> Earthling Michel Dänzer   |   http://www.amd.com
> Libre software enthusiast | Mesa and X developer
>


Re: PowerPC agpmode issues

2016-09-19 Thread Michel Dänzer
On 19/09/16 09:36 PM, Mathieu Malaterre wrote:
> 
> Finally your dmesg looks odd since the line `[drm] Forcing AGP to PCI
> mode` comes only after the first error.

That isn't odd but the AGP->PCI(e) fallback mechanism working as
intended, trying AGP first and falling back to PCIe if AGP fails.

Herminio, does the problem also occur if you specify radeon.agpmode=-1
(or maybe =1) on the kernel command line?


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer


Re: [PATCH] powerpc/mm: Update the FORCE_MAX_ZONEORDER range to enable hugetlb

2016-09-19 Thread Balbir Singh


On 20/09/16 12:52, Aneesh Kumar K.V wrote:
> Balbir Singh  writes:
> 
>> On 20/09/16 03:31, Aneesh Kumar K.V wrote:
[...]
>>
>> Do we need the range to be 12 13?
> 
> static inline bool hstate_is_gigantic(struct hstate *h)
> {
>   return huge_page_order(h) >= MAX_ORDER;
> }
> 
> We consider hstate gigantic if it also == MAX_ORDER. Hence it should
> be > 12 .
> 

Good point, agreed

Balbir Singh.
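The arithmetic, for reference:

    order(16M hugepage, 4K base page) = log2(16M / 4K) = 12
    hstate_is_gigantic(): order >= MAX_ORDER  =>  gigantic
    => MAX_ORDER (i.e. FORCE_MAX_ZONEORDER) must be at least 13,
       hence a range starting at 13 rather than 12.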


Re: [PATCH] powerpc/mm: Update the FORCE_MAX_ZONEORDER range to enable hugetlb

2016-09-19 Thread Aneesh Kumar K.V
Balbir Singh  writes:

> On 20/09/16 03:31, Aneesh Kumar K.V wrote:
>> For hugetlb to work with a 4K page size, we need MAX_ORDER to be at least
>> 13. When switching from a 64K page size to a 4K Linux page size using
>> make nconfig, we end up with a CONFIG_FORCE_MAX_ZONEORDER value of 9.
>> This results in the 16M hugepage being considered a gigantic huge page,
>> which in turn can result in failure to set up hugepages if gigantic
>> hugepage support is not enabled.
>>
>> This also results in a kernel crash with the 4K radix configuration. We
>> hit the below BUG_ON on radix:
>> 
>>  kernel BUG at mm/huge_memory.c:364!
>>  Oops: Exception in kernel mode, sig: 5 [#1]
>>  SMP NR_CPUS=2048 NUMA PowerNV
>>  Modules linked in:
>>  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.8.0-rc1-6-gbae9cc6 #1
>>  task: c000f1af8000 task.stack: c000f1aec000
>>  NIP: c0c5fa0c LR: c0c5f9d8 CTR: c0c5f9a4
>>  REGS: c000f1aef920 TRAP: 0700   Not tainted (4.8.0-rc1-6-gbae9cc6)
>>  MSR: 900102029033   CR: 24000844  
>> XER: 
>>  CFAR: c0c5f9e0 SOFTE: 1
>> .
>>  NIP [c0c5fa0c] hugepage_init+0x68/0x238
>>  LR [c0c5f9d8] hugepage_init+0x34/0x238
>> 
>> Fixes: a7ee539584acf ("powerpc/Kconfig: Update config option based on page 
>> size")
>> 
>> Reported-by: Santhosh 
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>  arch/powerpc/Kconfig | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 927d2ab2ce08..792cb1768c8f 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -637,7 +637,7 @@ config FORCE_MAX_ZONEORDER
>>  int "Maximum zone order"
>>  range 8 9 if PPC64 && PPC_64K_PAGES
>>  default "9" if PPC64 && PPC_64K_PAGES
>> -range 9 13 if PPC64 && !PPC_64K_PAGES
>> +range 13 13 if PPC64 && !PPC_64K_PAGES
>
> Do we need the range to be 12 13?

static inline bool hstate_is_gigantic(struct hstate *h)
{
return huge_page_order(h) >= MAX_ORDER;
}

We consider hstate gigantic if it also == MAX_ORDER. Hence it should
be > 12 .

>
>>  default "13" if PPC64 && !PPC_64K_PAGES
>>  range 9 64 if PPC32 && PPC_16K_PAGES
>>  default "9" if PPC32 && PPC_16K_PAGES
>> 
>
> Otherwise
>
> Acked-by: Balbir Singh 



Re: [PATCH v2 2/3] powerpc: get hugetlbpage handling more generic

2016-09-19 Thread Aneesh Kumar K.V
christophe leroy  writes:

>>
>>
>>> for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
>>> unsigned shift;
>>> unsigned pdshift;
>>> @@ -860,16 +807,31 @@ static int __init hugetlbpage_init(void)
>>>  * if we have pdshift and shift value same, we don't
>>>  * use pgt cache for hugepd.
>>>  */
>>> -   if (pdshift != shift) {
>>> +   if (pdshift > shift) {
>>> pgtable_cache_add(pdshift - shift, NULL);
>>> if (!PGT_CACHE(pdshift - shift))
>>> panic("hugetlbpage_init(): could not create "
>>>   "pgtable cache for %d bit pagesize\n", 
>>> shift);
>>> +   } else if (!hugepte_cache) {
>>> +   /*
>>> +* Create a kmem cache for hugeptes.  The bottom bits in
>>> +* the pte have size information encoded in them, so
>>> +* align them to allow this
>>> +*/
>>> +   hugepte_cache = kmem_cache_create("hugepte-cache",
>>> + sizeof(pte_t),
>>> + HUGEPD_SHIFT_MASK + 1,
>>> + 0, NULL);
>>> +   if (hugepte_cache == NULL)
>>> +   panic("%s: Unable to create kmem cache "
>>> + "for hugeptes\n", __func__);
>>> +
>>
>>
>> We don't need hugepte_cache for book3s 64K. I guess we will endup
>> creating one here ?
>
> Should not, because on book3s 64k, we will have pdshift > shift
> won't we ?
>

on 64k book3s, we have pdshift == shift and we don't need to create 
hugepd cache on book3s 64k.

-aneesh



Re: [PATCH v2 2/3] powerpc: get hugetlbpage handling more generic

2016-09-19 Thread Aneesh Kumar K.V
christophe leroy  writes:

> On 19/09/2016 at 07:50, Aneesh Kumar K.V wrote:
>>
>> Christophe Leroy  writes:
>>> +#else
>>> +static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
>>> +{
>>> +   BUG();
>>> +}
>>> +
>>>  #endif
>>
>>
>> I was expecting that BUG will get removed in the next patch. But I don't
>> see it in the next patch. Considering
>>
>> @@ -475,11 +453,10 @@ static void free_hugepd_range(struct mmu_gather *tlb, 
>> hugepd_t *hpdp, int pdshif
>> for (i = 0; i < num_hugepd; i++, hpdp++)
>> hpdp->pd = 0;
>>
>> -#ifdef CONFIG_PPC_FSL_BOOK3E
>> -hugepd_free(tlb, hugepte);
>> -#else
>> -pgtable_free_tlb(tlb, hugepte, pdshift - shift);
>> -#endif
>> +if (shift >= pdshift)
>> +hugepd_free(tlb, hugepte);
>> +else
>> +pgtable_free_tlb(tlb, hugepte, pdshift - shift);
>>  }
>>
>> What is that I am missing ?
>>
>
> Previously, call to hugepd_free() was compiled only when #ifdef 
> CONFIG_PPC_FSL_BOOK3E
> Now, it is compiled at all time, but it should never be called if not 
> CONFIG_PPC_FSL_BOOK3E because we always have shift < pdshift in that case.
> Then the function needs to be defined anyway but should never be called. 
> Should I just define it static inline {} ?
>

For 8M with 4K mode, we have shift >= pdshift right ?

-aneesh



Re: [PATCH] powerpc/mm: Update the FORCE_MAX_ZONEORDER range to enable hugetlb

2016-09-19 Thread Balbir Singh


On 20/09/16 03:31, Aneesh Kumar K.V wrote:
> For hugetlb to work with a 4K page size, we need MAX_ORDER to be at least
> 13. When switching from a 64K page size to a 4K Linux page size using
> make nconfig, we end up with a CONFIG_FORCE_MAX_ZONEORDER value of 9.
> This results in the 16M hugepage being considered a gigantic huge page,
> which in turn can result in failure to set up hugepages if gigantic
> hugepage support is not enabled.
>
> This also results in a kernel crash with the 4K radix configuration. We
> hit the below BUG_ON on radix:
> 
>  kernel BUG at mm/huge_memory.c:364!
>  Oops: Exception in kernel mode, sig: 5 [#1]
>  SMP NR_CPUS=2048 NUMA PowerNV
>  Modules linked in:
>  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.8.0-rc1-6-gbae9cc6 #1
>  task: c000f1af8000 task.stack: c000f1aec000
>  NIP: c0c5fa0c LR: c0c5f9d8 CTR: c0c5f9a4
>  REGS: c000f1aef920 TRAP: 0700   Not tainted (4.8.0-rc1-6-gbae9cc6)
>  MSR: 900102029033   CR: 24000844  
> XER: 
>  CFAR: c0c5f9e0 SOFTE: 1
> .
>  NIP [c0c5fa0c] hugepage_init+0x68/0x238
>  LR [c0c5f9d8] hugepage_init+0x34/0x238
> 
> Fixes: a7ee539584acf ("powerpc/Kconfig: Update config option based on page 
> size")
> 
> Reported-by: Santhosh 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 927d2ab2ce08..792cb1768c8f 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -637,7 +637,7 @@ config FORCE_MAX_ZONEORDER
>   int "Maximum zone order"
>   range 8 9 if PPC64 && PPC_64K_PAGES
>   default "9" if PPC64 && PPC_64K_PAGES
> - range 9 13 if PPC64 && !PPC_64K_PAGES
> + range 13 13 if PPC64 && !PPC_64K_PAGES

Do we need the range to be 12 13?

>   default "13" if PPC64 && !PPC_64K_PAGES
>   range 9 64 if PPC32 && PPC_16K_PAGES
>   default "9" if PPC32 && PPC_16K_PAGES
> 

Otherwise

Acked-by: Balbir Singh 


Re: [PATCH v21 00/20] perf, tools: Add support for PMU events in JSON format

2016-09-19 Thread Arnaldo Carvalho de Melo
On Mon, Sep 19, 2016 at 09:02:58PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, Sep 19, 2016 at 08:37:53PM -0300, Arnaldo Carvalho de Melo wrote:
> > yeah, changing that typedef + true def to a plain include of <stdbool.h>
> > makes it progress to the next failure, which is in cross compilation
> > environments, such as using fedora 24 + the Android NDK to try to build
> > an ARM android binary.

> 14 fedora:24-x-ARC-uClibc: FAIL
>   GEN  /tmp/build/perf/pmu-events/pmu-events.c
> /bin/sh: /tmp/build/perf/pmu-events/jevents: cannot execute binary file: Exec 
> format error
> pmu-events/Build:11: recipe for target 
> '/tmp/build/perf/pmu-events/pmu-events.c' failed
> make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 126
> Makefile.perf:461: recipe for target 
> '/tmp/build/perf/pmu-events/pmu-events-in.o' failed
> make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
> make[1]: *** Waiting for unfinished jobs

Jiri, we need something similar to scripts/Makefile.host :-\

Calling it a day. Perhaps, for now, we should just detect that it is a
cross-compile env (CROSS_COMPILE is set) and exclude all this code from
the build, emitting a warning.
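A rough sketch of that stop-gap (illustrative only; the variable names are made
up, not taken from a posted patch):

# tools/perf Makefile fragment (hypothetical)
ifdef CROSS_COMPILE
  $(warning Cross compiling: skipping JSON PMU event tables since jevents cannot run on the build host)
  NO_JEVENTS := 1
endif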

I left what I did at the tmp.perf/core branch of my repo at
git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git.

- Arnaldo


Re: [PATCH v21 00/20] perf, tools: Add support for PMU events in JSON format

2016-09-19 Thread Arnaldo Carvalho de Melo
On Mon, Sep 19, 2016 at 08:37:53PM -0300, Arnaldo Carvalho de Melo wrote:
> yeah, changing that typedef + true def to a plain include of <stdbool.h>
> makes it progress to the next failure, which is in cross compilation
> environments, such as using fedora 24 + the Android NDK to try to build
> an ARM android binary.
> 
> On the bright side, in addition to alpine:3.4 now these are building ok:
> 
>  3 archlinux:latest: Ok
>  4 centos:5: Ok
>  5 centos:6: Ok
>  6 centos:7: Ok
>  7 debian:7: Ok
>  8 debian:8: Ok
>  9 fedora:20: Ok
> 
> Waiting for some extra cross compilation envs to check that hunch...

Yeap:

10 fedora:21: Ok
11 fedora:22: Ok
12 fedora:23: Ok
13 fedora:24: Ok
14 fedora:24-x-ARC-uClibc: FAIL
  GEN  /tmp/build/perf/pmu-events/pmu-events.c
/bin/sh: /tmp/build/perf/pmu-events/jevents: cannot execute binary file: Exec 
format error
pmu-events/Build:11: recipe for target 
'/tmp/build/perf/pmu-events/pmu-events.c' failed
make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 126
Makefile.perf:461: recipe for target 
'/tmp/build/perf/pmu-events/pmu-events-in.o' failed
make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
make[1]: *** Waiting for unfinished jobs
15 fedora:rawhide: Ok
16 mageia:5: Ok
17 opensuse:13.2: Ok
18 opensuse:42.1: Ok
19 86.364838418 opensuse:tumbleweed: Ok
20 62.414227297 ubuntu:12.04.5: Ok
21 41.172970676 ubuntu:14.04: Ok
22 74.243049939 ubuntu:14.04.4: Ok
23 75.634449405 ubuntu:15.10: Ok
24 70.487913217 ubuntu:16.04: Ok
25 33.938554395 ubuntu:16.04-x-arm: FAIL
26 31.902507224 ubuntu:16.04-x-arm64: FAIL
27 32.903876355 ubuntu:16.04-x-powerpc64: FAIL
28 32.849412876 ubuntu:16.04-x-powerpc64el: FAIL
29 79.446950856 ubuntu:16.10: Ok
30 33.443948539 ubuntu:16.10-x-arm64: FAIL
31 33.801496151 ubuntu:16.10-x-powerpc: FAIL
32 32.860975730 ubuntu:16.10-x-s390: FAIL




Re: [PATCH v21 00/20] perf, tools: Add support for PMU events in JSON format

2016-09-19 Thread Arnaldo Carvalho de Melo
On Mon, Sep 19, 2016 at 08:31:13PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, Sep 19, 2016 at 06:20:17PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Thu, Sep 15, 2016 at 03:24:37PM -0700, Sukadev Bhattiprolu wrote:
> > > CPUs support a large number of performance monitoring events (PMU events)
> > > and often these events are very specific to an architecture/model of the
> > > CPU. To use most of these PMU events with perf, we currently have to 
> > > identify
> > > them by their raw codes:
> > > 
> > >   perf stat -e r100f2 sleep 1
> > 
> > So, trying to build this with my set of containers I get lots of failures,
> > double checking running on another machine with those containers for the
> > segfault cases, changing sys/fcntl.h to fcntl.h cures the build on 
> > alpine:3.4
> > (musl libc), will try to fix all of them.
> > 
> > The list is incomplete, lots of other systems failed as well.
> > 
> > - Arnaldo
> > 
>  1 alpine:3.4: Ok
> 
> Fixed with fcntl.h + capping the maxfds parameter to nftw to avoid it
> exploding on alloca() in environments where rlim_max is set to a high
> value, like in docker.
> 
> Now looking at:
> 
>   CC   /tmp/build/perf/fs/tracing_path.o
> In file included from /git/linux/tools/include/linux/types.h:4:0,
>  from 
> /opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/sys/types.h:35,
>  from 
> /opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/strings.h:42,
>  from 
> /opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/stdlib.h:36,
>  from pmu-events/json.c:31:
> pmu-events/json.h:15:22: error: two or more data types in declaration 
> specifiers
>  typedef unsigned int bool;
> 
> 
> Which looks like clashing with stdbool.h...

yeah, changing that typedef + true def to a plain include of <stdbool.h>
makes it progress to the next failure, which is in cross-compilation
environments, such as using Fedora 24 + the Android NDK to try to build
an ARM Android binary.
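For reference, the change in pmu-events/json.h amounts to something like the
sketch below (the idea only, not the exact hunk applied):

  /* Before: an open-coded boolean type, which clashes with toolchains/libcs
   * that already provide one (e.g. via stdbool.h or the NDK headers):
   *
   *     typedef unsigned int bool;
   *     #define true 1
   */

  /* After: rely on the standard C99 header instead. */
  #include <stdbool.h>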

On the bright side, in addition to alpine:3.4 now these are building ok:

 3 archlinux:latest: Ok
 4 centos:5: Ok
 5 centos:6: Ok
 6 centos:7: Ok
 7 debian:7: Ok
 8 debian:8: Ok
 9 fedora:20: Ok

Waiting for some extra cross compilation envs to check that hunch...

- Arnaldo


Re: [PATCH v21 00/20] perf, tools: Add support for PMU events in JSON format

2016-09-19 Thread Arnaldo Carvalho de Melo
On Mon, Sep 19, 2016 at 06:20:17PM -0300, Arnaldo Carvalho de Melo wrote:
> On Thu, Sep 15, 2016 at 03:24:37PM -0700, Sukadev Bhattiprolu wrote:
> > CPUs support a large number of performance monitoring events (PMU events)
> > and often these events are very specific to an architecture/model of the
> > CPU. To use most of these PMU events with perf, we currently have to 
> > identify
> > them by their raw codes:
> > 
> > perf stat -e r100f2 sleep 1
> 
> So, trying to build this with my set of containers I get lots of failures,
> double checking running on another machine with those containers for the
> segfault cases, changing sys/fcntl.h to fcntl.h cures the build on alpine:3.4
> (musl libc), will try to fix all of them.
> 
> The list is incomplete, lots of other systems failed as well.
> 
> - Arnaldo
> 
 1 alpine:3.4: Ok

Fixed with fcntl.h + capping the maxfds parameter to nftw to avoid it
exploding on alloca() in environments where rlim_max is set to a high
value, like in docker.
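For reference, the maxfds capping amounts to something like the sketch below
(illustrative only; the helper name and the 512 cap are assumptions, not
necessarily what the actual fix uses):

  #include <ftw.h>
  #include <sys/resource.h>

  /* As noted above, nftw() ends up in alloca() for its per-descriptor state,
   * so deriving maxfds straight from RLIMIT_NOFILE can blow the stack when
   * rlim_max is huge (as seen inside docker). Cap it to something sane. */
  static int get_maxfds(void)
  {
          struct rlimit rlim;

          if (getrlimit(RLIMIT_NOFILE, &rlim) == 0 && rlim.rlim_max < 512)
                  return (int)rlim.rlim_max / 2;

          return 512;
  }

  /* ... then: nftw(dirname, process_one_file, get_maxfds(), 0); */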

Now looking at:

  CC   /tmp/build/perf/fs/tracing_path.o
In file included from /git/linux/tools/include/linux/types.h:4:0,
 from 
/opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/sys/types.h:35,
 from 
/opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/strings.h:42,
 from 
/opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/stdlib.h:36,
 from pmu-events/json.c:31:
pmu-events/json.h:15:22: error: two or more data types in declaration specifiers
 typedef unsigned int bool;


Which looks like clashing with stdbool.h...

- Arnaldo


> android-ndk:r12b-arm: FAIL
> 
>   CC   /tmp/build/perf/event-plugin.o
> pmu-events/json.c:35:23: fatal error: sys/fcntl.h: No such file or
> directory
>  #include <sys/fcntl.h>
>^
> compilation terminated.
> ---
> archlinux:latest: FAIL
> /bin/sh: line 1:  1408 Segmentation fault  (core dumped) 
> /tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
> /tmp/build/perf/pmu-events/pmu-events.c
> make[2]: *** [pmu-events/Build:11: /tmp/build/perf/pmu-events/pmu-events.c] 
> Error 139
> make[1]: *** [Makefile.perf:461: /tmp/build/perf/pmu-events/pmu-events-in.o] 
> Error 2
> make[1]: *** Waiting for unfinished jobs
> ---
> centos:5: FAIL
> /bin/sh: line 1:  1336 Segmentation fault  (core dumped) 
> /tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
> /tmp/build/perf/pmu-events/pmu-events.c
> make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
> make[1]: *** Waiting for unfinished jobs
> ---
> centos:6: FAIL
> /bin/sh: line 1:  1633 Segmentation fault  (core dumped) 
> /tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
> /tmp/build/perf/pmu-events/pmu-events.c
> make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
> make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
> make[1]: *** Waiting for unfinished jobs
> ---
> centos:7: FAIL
>   GEN  /tmp/build/perf/pmu-events/pmu-events.c
> /bin/sh: line 1:  1548 Segmentation fault  (core dumped) 
> /tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
> /tmp/build/perf/pmu-events/pmu-events.c
> make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
> make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
> make[1]: *** Waiting for unfinished jobs
> ---
> debian:7: Ok
> ---
> debian:8: FAIL
>   GEN  /tmp/build/perf/pmu-events/pmu-events.c
> Segmentation fault (core dumped)
> pmu-events/Build:11: recipe for target 
> '/tmp/build/perf/pmu-events/pmu-events.c' failed
> make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
> Makefile.perf:461: recipe for target 
> '/tmp/build/perf/pmu-events/pmu-events-in.o' failed
> make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
> make[1]: *** Waiting for unfinished jobs
> ---
> debian:experimental: FAIL
>   GEN  /tmp/build/perf/pmu-events/pmu-events.c
> Segmentation fault (core dumped)
> pmu-events/Build:11: recipe for target 
> '/tmp/build/perf/pmu-events/pmu-events.c' failed
> make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
> Makefile.perf:461: recipe for target 
> '/tmp/build/perf/pmu-events/pmu-events-in.o' failed
> make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
> ---
> fedora:20: FAIL
> /bin/sh: line 1:  1460 Segmentation fault  (core dumped) 
> 

Re: [PATCH v21 00/20] perf, tools: Add support for PMU events in JSON format

2016-09-19 Thread Arnaldo Carvalho de Melo
On Thu, Sep 15, 2016 at 03:24:37PM -0700, Sukadev Bhattiprolu wrote:
> CPUs support a large number of performance monitoring events (PMU events)
> and often these events are very specific to an architecture/model of the
> CPU. To use most of these PMU events with perf, we currently have to identify
> them by their raw codes:
> 
>   perf stat -e r100f2 sleep 1

So, trying to build this with my set of containers I get lots of failures,
double checking running on another machine with those containers for the
segfault cases, changing sys/fcntl.h to fcntl.h cures the build on alpine:3.4
(musl libc), will try to fix all of them.

The list is incomplete, lots of other systems failed as well.

- Arnaldo

 1 alpine:3.4: FAIL
  CC   /tmp/build/perf/pmu-events/jsmn.o
In file included from pmu-events/json.c:35:0:
/usr/include/sys/fcntl.h:1:2: error: #warning redirecting incorrect
#include <sys/fcntl.h> to <fcntl.h> [-Werror=cpp]
 #warning redirecting incorrect #include <sys/fcntl.h> to <fcntl.h>
---
android-ndk:r12b-arm: FAIL

  CC   /tmp/build/perf/event-plugin.o
pmu-events/json.c:35:23: fatal error: sys/fcntl.h: No such file or
directory
 #include <sys/fcntl.h>
   ^
compilation terminated.
---
archlinux:latest: FAIL
/bin/sh: line 1:  1408 Segmentation fault  (core dumped) 
/tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
/tmp/build/perf/pmu-events/pmu-events.c
make[2]: *** [pmu-events/Build:11: /tmp/build/perf/pmu-events/pmu-events.c] 
Error 139
make[1]: *** [Makefile.perf:461: /tmp/build/perf/pmu-events/pmu-events-in.o] 
Error 2
make[1]: *** Waiting for unfinished jobs
---
centos:5: FAIL
/bin/sh: line 1:  1336 Segmentation fault  (core dumped) 
/tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
/tmp/build/perf/pmu-events/pmu-events.c
make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
make[1]: *** Waiting for unfinished jobs
---
centos:6: FAIL
/bin/sh: line 1:  1633 Segmentation fault  (core dumped) 
/tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
/tmp/build/perf/pmu-events/pmu-events.c
make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
make[1]: *** Waiting for unfinished jobs
---
centos:7: FAIL
  GEN  /tmp/build/perf/pmu-events/pmu-events.c
/bin/sh: line 1:  1548 Segmentation fault  (core dumped) 
/tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
/tmp/build/perf/pmu-events/pmu-events.c
make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
make[1]: *** Waiting for unfinished jobs
---
debian:7: Ok
---
debian:8: FAIL
  GEN  /tmp/build/perf/pmu-events/pmu-events.c
Segmentation fault (core dumped)
pmu-events/Build:11: recipe for target 
'/tmp/build/perf/pmu-events/pmu-events.c' failed
make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
Makefile.perf:461: recipe for target 
'/tmp/build/perf/pmu-events/pmu-events-in.o' failed
make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
make[1]: *** Waiting for unfinished jobs
---
debian:experimental: FAIL
  GEN  /tmp/build/perf/pmu-events/pmu-events.c
Segmentation fault (core dumped)
pmu-events/Build:11: recipe for target 
'/tmp/build/perf/pmu-events/pmu-events.c' failed
make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
Makefile.perf:461: recipe for target 
'/tmp/build/perf/pmu-events/pmu-events-in.o' failed
make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
---
fedora:20: FAIL
/bin/sh: line 1:  1460 Segmentation fault  (core dumped) 
/tmp/build/perf/pmu-events/jevents x86 pmu-events/arch 
/tmp/build/perf/pmu-events/pmu-events.c
make[2]: *** [/tmp/build/perf/pmu-events/pmu-events.c] Error 139
make[1]: *** [/tmp/build/perf/pmu-events/pmu-events-in.o] Error 2
make[1]: *** Waiting for unfinished jobs
---


Re: [PATCH v5 2/2] QE: remove PPCisms for QE

2016-09-19 Thread Leo Li
On Mon, Jul 25, 2016 at 12:43 AM, Zhao Qiang  wrote:
> QE was supported on PowerPC and dependent on PPC.
> Now it is supported on other platforms, so remove the PPCisms.
>
> Signed-off-by: Zhao Qiang 
> ---
> Changes for v2:
> - na
> Changes for v3:
> - add NO_IRQ
> Changes for v4:
> - modify spin_event_timeout to an open-coded timeout loop
> - remove NO_IRQ
> - modify virq_to_hw to open-coded code
> Changes for v5:
> - modify commit msg
> - modify depends of QUICC_ENGINE
> - add kerneldoc header for qe_issue_cmd
>
>  drivers/irqchip/qe_ic.c   | 28 +--
>  drivers/soc/fsl/qe/Kconfig|  2 +-
>  drivers/soc/fsl/qe/qe.c   | 80 
> ++-
>  drivers/soc/fsl/qe/qe_io.c| 42 ++-
>  drivers/soc/fsl/qe/qe_tdm.c   |  8 ++---
>  drivers/soc/fsl/qe/ucc.c  | 10 +++---
>  drivers/soc/fsl/qe/ucc_fast.c | 68 ++--
>  include/soc/fsl/qe/qe.h   |  1 -
>  include/soc/fsl/qe/qe_ic.h| 12 +++
>  9 files changed, 133 insertions(+), 118 deletions(-)
>

[snip]
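(For reference, the v4 changelog item about replacing spin_event_timeout()
with an open-coded timeout loop would look roughly like the sketch below; the
register block, field and flag names here are placeholders, not the actual
code from the patch.)

  #include <linux/delay.h>    /* udelay() */
  #include <linux/errno.h>    /* ETIMEDOUT */
  #include <linux/io.h>       /* ioread32be() */
  #include <linux/types.h>

  struct qe_cp_regs { __be32 cecr; };    /* placeholder layout */

  static int qe_wait_cmd_done(struct qe_cp_regs __iomem *regs, u32 busy_flag)
  {
          int i;

          /* Poll the busy flag instead of using the PPC-only
           * spin_event_timeout(); give up after ~100us so a wedged QE
           * cannot hang the caller forever. */
          for (i = 0; i < 100; i++) {
                  if (!(ioread32be(&regs->cecr) & busy_flag))
                          return 0;
                  udelay(1);
          }
          return -ETIMEDOUT;
  }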

> diff --git a/drivers/soc/fsl/qe/Kconfig b/drivers/soc/fsl/qe/Kconfig
> index 73a2e08..b26b643 100644
> --- a/drivers/soc/fsl/qe/Kconfig
> +++ b/drivers/soc/fsl/qe/Kconfig
> @@ -4,7 +4,7 @@
>
>  config QUICC_ENGINE
> bool "Freescale QUICC Engine (QE) Support"
> -   depends on FSL_SOC && PPC32
> +   depends on OF && HAS_IOMEM
> select GENERIC_ALLOCATOR
> select CRC32
> help

You make it possible to build QE drivers on ARM, but the UCC_GETH
fails to build on arm64.  Please make sure all these drivers can build
on other architectures.  Or you can simply make them only build for
Power architecture as most of them are not available on ARM.

Regards,
Leo


Re: [PATCH] MAINTAINERS: Update cxl maintainers

2016-09-19 Thread Frederic Barrat


On 16/09/2016 at 06:28, Michael Neuling wrote:

Fred has taken over the cxl maintenance I was doing.  This updates the
MAINTAINERS file to reflect this.

It also removes a duplicate entry in the files covered.

Signed-off-by: Michael Neuling 



Acked-by: Frederic Barrat 



Re: [PATCH v2 1/3] powerpc: port 64 bits pgtable_cache to 32 bits

2016-09-19 Thread christophe leroy



On 19/09/2016 at 07:22, Aneesh Kumar K.V wrote:

Christophe Leroy  writes:


Today powerpc64 uses a set of pgtable_caches while powerpc32 uses
standard pages when using 4k pages and a single pgtable_cache
if using other size pages.

In preparation of implementing huge pages on the 8xx, this patch
replaces the specific powerpc32 handling by the 64 bits approach.

This is done by:
* moving 64 bits pgtable_cache_add() and pgtable_cache_init()
in a new file called init-common.c
* modifying pgtable_cache_init() to also handle the case
without PMD
* removing the 32 bits version of pgtable_cache_add() and
pgtable_cache_init()
* copying related header contents from 64 bits into both the
book3s/32 and nohash/32 header files

On the 8xx, the following cache sizes will be used:
* 4k pages mode:
- PGT_CACHE(10) for PGD
- PGT_CACHE(3) for 512k hugepage tables
* 16k pages mode:
- PGT_CACHE(6) for PGD
- PGT_CACHE(7) for 512k hugepage tables
- PGT_CACHE(3) for 8M hugepage tables

Signed-off-by: Christophe Leroy 
---
v2: in v1, hugepte_cache was wrongly replaced by PGT_CACHE(1).
This modification has been removed from v2.

 arch/powerpc/include/asm/book3s/32/pgalloc.h |  44 ++--
 arch/powerpc/include/asm/book3s/32/pgtable.h |  43 
 arch/powerpc/include/asm/book3s/64/pgtable.h |   3 -
 arch/powerpc/include/asm/nohash/32/pgalloc.h |  44 ++--
 arch/powerpc/include/asm/nohash/32/pgtable.h |  45 
 arch/powerpc/include/asm/nohash/64/pgtable.h |   2 -
 arch/powerpc/include/asm/pgtable.h   |   2 +
 arch/powerpc/mm/Makefile |   3 +-
 arch/powerpc/mm/init-common.c| 147 +++
 arch/powerpc/mm/init_64.c|  77 --
 arch/powerpc/mm/pgtable_32.c |  37 ---
 11 files changed, 273 insertions(+), 174 deletions(-)
 create mode 100644 arch/powerpc/mm/init-common.c

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 8e21bb4..d310546 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -2,14 +2,42 @@
 #define _ASM_POWERPC_BOOK3S_32_PGALLOC_H

 #include 
+#include 

-/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
-#define MAX_PGTABLE_INDEX_SIZE 0
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE 0xf

 extern void __bad_pte(pmd_t *pmd);

-extern pgd_t *pgd_alloc(struct mm_struct *mm);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+extern struct kmem_cache *pgtable_cache[];
+#define PGT_CACHE(shift) ({\
+   BUG_ON(!(shift));   \
+   pgtable_cache[(shift) - 1]; \
+   })
+
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+   return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+   kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
+}

 /*
  * We don't have any real pmd's, and this code never triggers because
@@ -68,8 +96,12 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t 
ptepage)

 static inline void pgtable_free(void *table, unsigned index_size)
 {
-   BUG_ON(index_size); /* 32-bit doesn't use this */
-   free_page((unsigned long)table);
+   if (!index_size) {
+   free_page((unsigned long)table);
+   } else {
+   BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
+   kmem_cache_free(PGT_CACHE(index_size), table);
+   }
 }

 #define check_pgt_cache()  do { } while (0)
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 6b8b2d5..f887499 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -8,6 +8,26 @@
 /* And here we include common definitions */
 #include 

+#define PTE_INDEX_SIZE PTE_SHIFT
+#define PMD_INDEX_SIZE 0
+#define PUD_INDEX_SIZE 0
+#define PGD_INDEX_SIZE (32 - PGDIR_SHIFT)
+
+#define PMD_CACHE_INDEXPMD_INDEX_SIZE
+
+#ifndef 

Re: [PATCH v2 2/3] powerpc: get hugetlbpage handling more generic

2016-09-19 Thread christophe leroy



On 19/09/2016 at 07:50, Aneesh Kumar K.V wrote:


Christophe Leroy  writes:

+#else
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+   BUG();
+}
+
 #endif



I was expecting that BUG will get removed in the next patch. But I don't
see it in the next patch. Considering

@@ -475,11 +453,10 @@ static void free_hugepd_range(struct mmu_gather *tlb, 
hugepd_t *hpdp, int pdshif
for (i = 0; i < num_hugepd; i++, hpdp++)
hpdp->pd = 0;

-#ifdef CONFIG_PPC_FSL_BOOK3E
-   hugepd_free(tlb, hugepte);
-#else
-   pgtable_free_tlb(tlb, hugepte, pdshift - shift);
-#endif
+   if (shift >= pdshift)
+   hugepd_free(tlb, hugepte);
+   else
+   pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }

What is that I am missing ?



Previously, the call to hugepd_free() was compiled only under
#ifdef CONFIG_PPC_FSL_BOOK3E.
Now it is compiled at all times, but it should never be called when
CONFIG_PPC_FSL_BOOK3E is not set, because we always have shift < pdshift in that case.
So the function needs to be defined anyway but should never be called.
Should I just define it as an empty static inline?
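For illustration, such a stub is just (sketch): the now-unconditional call
site still compiles, but no code is generated when CONFIG_PPC_FSL_BOOK3E is
not set:

  static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte)
  {
          /* Never reached: shift >= pdshift only happens on FSL_BOOK3E. */
  }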


Christophe




Re: [PATCH v2 2/3] powerpc: get hugetlbpage handling more generic

2016-09-19 Thread christophe leroy



On 19/09/2016 at 07:45, Aneesh Kumar K.V wrote:

Christophe Leroy  writes:


Today there are two implementations of hugetlbpages which are managed
by exclusive #ifdefs:
* FSL_BOOKE: several directory entries points to the same single hugepage
* BOOK3S: one upper level directory entry points to a table of hugepages

In preparation of implementation of hugepage support on the 8xx, we
need a mix of the two above solutions, because the 8xx needs both cases
depending on the size of pages:
* In 4k page size mode, each PGD entry covers a 4M bytes area. It means
that 2 PGD entries will be necessary to cover an 8M hugepage while a
single PGD entry will cover 8x 512k hugepages.
* In 16k page size mode, each PGD entry covers a 64M bytes area. It means
that 8x 8M hugepages will be covered by one PGD entry and 64x 512k
hugepages will be covered by one PGD entry.

This patch:
* removes #ifdefs in favor of if/else based on the range sizes
* merges the two huge_pte_alloc() functions as they are pretty similar
* merges the two hugetlbpage_init() functions as they are pretty similar

Signed-off-by: Christophe Leroy 
---
v2: This part is new and results from a split of the last patch of the v1 series
into two parts

 arch/powerpc/mm/hugetlbpage.c | 189 +-
 1 file changed, 77 insertions(+), 112 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8a512b1..2119f00 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c


[...]




-#ifdef CONFIG_PPC_FSL_BOOK3E
 struct kmem_cache *hugepte_cache;
 static int __init hugetlbpage_init(void)
 {
int psize;

-   for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
-   unsigned shift;
-
-   if (!mmu_psize_defs[psize].shift)
-   continue;
-
-   shift = mmu_psize_to_shift(psize);
-
-   /* Don't treat normal page sizes as huge... */
-   if (shift != PAGE_SHIFT)
-   if (add_huge_page_size(1ULL << shift) < 0)
-   continue;
-   }
-
-   /*
-* Create a kmem cache for hugeptes.  The bottom bits in the pte have
-* size information encoded in them, so align them to allow this
-*/
-   hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
-  HUGEPD_SHIFT_MASK + 1, 0, NULL);
-   if (hugepte_cache == NULL)
-   panic("%s: Unable to create kmem cache for hugeptes\n",
- __func__);
-
-   /* Default hpage size = 4M */
-   if (mmu_psize_defs[MMU_PAGE_4M].shift)
-   HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
-   else
-   panic("%s: Unable to set default huge page size\n", __func__);
-
-
-   return 0;
-}
-#else
-static int __init hugetlbpage_init(void)
-{
-   int psize;
-
+#if !defined(CONFIG_PPC_FSL_BOOK3E)
if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
return -ENODEV;
-
+#endif


Do we need that #if? radix_enabled() should become 0 and that if
condition should be removed at compile time, shouldn't it? Or are you
finding errors with that?


With radix_enabled() being 0, it becomes:

if (!mmu_has_feature(MMU_FTR_16M_PAGE))
return -ENODEV;

Which means hugepages would only be handled by CPUs having 16M pages.
That's the issue.






for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
unsigned shift;
unsigned pdshift;
@@ -860,16 +807,31 @@ static int __init hugetlbpage_init(void)
 * if we have pdshift and shift value same, we don't
 * use pgt cache for hugepd.
 */
-   if (pdshift != shift) {
+   if (pdshift > shift) {
pgtable_cache_add(pdshift - shift, NULL);
if (!PGT_CACHE(pdshift - shift))
panic("hugetlbpage_init(): could not create "
  "pgtable cache for %d bit pagesize\n", 
shift);
+   } else if (!hugepte_cache) {
+   /*
+* Create a kmem cache for hugeptes.  The bottom bits in
+* the pte have size information encoded in them, so
+* align them to allow this
+*/
+   hugepte_cache = kmem_cache_create("hugepte-cache",
+ sizeof(pte_t),
+ HUGEPD_SHIFT_MASK + 1,
+ 0, NULL);
+   if (hugepte_cache == NULL)
+   panic("%s: Unable to create kmem cache "
+ "for hugeptes\n", __func__);
+



We don't need 

[PATCH] powerpc/mm: Update the FORCE_MAX_ZONEORDER range to enable hugetlb

2016-09-19 Thread Aneesh Kumar K.V
For hugetlb to work with a 4K page size, we need MAX_ORDER to be at least
13: a 16M hugepage is an order-12 allocation of 4K pages, so it is only a
regular (non-gigantic) huge page if MAX_ORDER > 12. When switching from a
64K page size to a 4K linux page size using make nconfig, we end up with a
CONFIG_FORCE_MAX_ZONEORDER value of 9. This results in the 16M hugepage
being considered a gigantic huge page, which in turn can result in failure
to set up hugepages if gigantic hugepage support is not enabled.

This also results in kernel crash with 4K radix configuration. We
hit the below BUG_ON on radix

 kernel BUG at mm/huge_memory.c:364!
 Oops: Exception in kernel mode, sig: 5 [#1]
 SMP NR_CPUS=2048 NUMA PowerNV
 Modules linked in:
 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.8.0-rc1-6-gbae9cc6 #1
 task: c000f1af8000 task.stack: c000f1aec000
 NIP: c0c5fa0c LR: c0c5f9d8 CTR: c0c5f9a4
 REGS: c000f1aef920 TRAP: 0700   Not tainted (4.8.0-rc1-6-gbae9cc6)
 MSR: 900102029033   CR: 24000844  XER: 

 CFAR: c0c5f9e0 SOFTE: 1
.
 NIP [c0c5fa0c] hugepage_init+0x68/0x238
 LR [c0c5f9d8] hugepage_init+0x34/0x238

Fixes: a7ee539584acf ("powerpc/Kconfig: Update config option based on page 
size")

Reported-by: Santhosh 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 927d2ab2ce08..792cb1768c8f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -637,7 +637,7 @@ config FORCE_MAX_ZONEORDER
int "Maximum zone order"
range 8 9 if PPC64 && PPC_64K_PAGES
default "9" if PPC64 && PPC_64K_PAGES
-   range 9 13 if PPC64 && !PPC_64K_PAGES
+   range 13 13 if PPC64 && !PPC_64K_PAGES
default "13" if PPC64 && !PPC_64K_PAGES
range 9 64 if PPC32 && PPC_16K_PAGES
default "9" if PPC32 && PPC_16K_PAGES
-- 
2.7.4



Re: [PATCH v21 00/20] perf, tools: Add support for PMU events in JSON format

2016-09-19 Thread Sukadev Bhattiprolu
I messed up the Cc list. Fixing it now.

Sukadev Bhattiprolu [suka...@linux.vnet.ibm.com] wrote:
> CPUs support a large number of performance monitoring events (PMU events)
> and often these events are very specific to an architecture/model of the
> CPU. To use most of these PMU events with perf, we currently have to identify
> them by their raw codes:
> 
>   perf stat -e r100f2 sleep 1
> 
> This patchset allows architectures to specify these PMU events in JSON
> files located in 'tools/perf/pmu-events/arch/' of the mainline tree.
> The events from the JSON files for the architecture are then built into
> the perf binary.
> 
> At run time, perf identifies the specific set of events for the CPU and
> creates "event aliases". These aliases allow users to specify events by
> "name" as:
> 
>   perf stat -e pm_1plus_ppc_cmpl sleep 1
> 
> The file, 'tools/perf/pmu-events/README' in [PATCH 15/19] gives more
> details.
> 
> Note:
>   - All known events tables for the architecture are included in the
> perf binary.
> 
>   - For architectures that don't have any JSON files, an empty mapping
> table is created and they should continue to build.
> 
> Thanks to input from Andi Kleen, Jiri Olsa, Namhyung Kim and Ingo Molnar.
> 
> These patches are available from:
> 
>   https://github.com/sukadev/linux.git 
>   
>   Branch  Description
>   --
>   json-code-v21   Source Code only 
>   json-code+data-v21  Both code and data (for build/test/pull)
>   
> NOTE: Only "source code" patches (i.e those in json-code-v21) are 
> being
>   emailed. Please pull the json-code+data-v21 branch for build/test.
> 
> Changelog[v21]
>   - Rebase to recent perf/core
>   - Group the PMU events supported by a CPU model into topics and
> create a separate JSON file for each topic for each CPU (code
> and input from Jiri Olsa).
> 
> Changelog[v20]
>   - Rebase to recent perf/core
>   - Add Patch 20/20 to allow perf-stat to work with the period= field
> 
> Changelog[v19]
>   Rebase to recent perf/core; fix couple lines >80 chars.
> 
> Changelog[v18]
>   Rebase to recent perf/core; fix minor merge conflicts.
> 
> Changelog[v17]
>   Rebase to recent perf/core; couple of small fixes to processing Intel
>   JSON files; allow case-insensitive PMU event names.
> 
> Changelog[v16]
>   Rebase to recent perf/core; fix minor merge conflicts; drop 3 patches
>   that were merged into perf/core.
> 
> Changelog[v15]
>   Code changes:
>   - Fix 'perf list' usage string and update man page.
>   - Remove a redundant __maybe_unused tag.
>   - Rebase to recent perf/core branch.
> 
>   Data files updates: json-files-5 branch
>   - Rebase to perf/intel-json-files-5 from Andi Kleen
>   - Add patch from Madhavan Srinivasan for couple more Powerpc models
> 
> Changelog[v14]
>   Comments from Jiri Olsa:
>   - Change parameter name/type for pmu_add_cpu_aliases (from void *data
> to list_head *head)
>   - Use asprintf() in file_name_to_tablename() and simplify/reorg code.
>   - Use __weak definition from 
>   - Use fopen() with mode "w" and eliminate unlink()
>   - Remove minor TODO.
>   - Add error check for return value from strdup() in print_pmu_events().
>   - Move independent changes from patches 3,11,12 .. to separate patches
> for easier review/backport.
>   - Clarify mapfile's "header line support" in patch description.
>   - Fix build failure with DEBUG=1
> 
>   Comment from Andi Kleen:
>   - In tools/perf/pmu-events/Build, check for 'mapfile.csv' rather than
> 'mapfile*'
> 
>   Misc:
>   - Minor changes/clarifications to tools/perf/pmu-events/README.
> 
> 
> Changelog[v13]
>   Version: Individual patches have their own history :-) that I am
>   preserving. Patchset version (v13) is for overall patchset and is
>   somewhat arbitrary.
> 
>   - Added support for "categories" of events to perf
>   - Add mapfile, jevents build dependency on pmu-events.c
>   - Silence jevents when parsing JSON files unless V=1 is specified
>   - Cleanup error messages
>   - Fix memory leak with ->cpuid
>   - Rebase to Arnaldo's tree
>   - Allow overriding CPUID via environment variable
>   - Support long descriptions for events
>   - Handle header line in mapfile.csv
>   - Cleanup JSON files (trim PublicDescription if identical to/prefix of
> BriefDescription field)
> 
> 
> Andi Kleen (12):
>   perf, tools: Add jsmn `jasmine' JSON parser
>   perf, tools, jevents: Program to convert JSON file to C style file
>   perf, tools: Support CPU id matching for x86 v2
>   perf, tools: Support alias descriptions
>   perf, tools: Query terminal width and use in perf list
>   perf, tools: Add a --no-desc flag to perf list
>   

Re: [PATCH v6 4/7] perf annotate: Do not ignore call instruction with indirect target

2016-09-19 Thread Arnaldo Carvalho de Melo
On Fri, Aug 19, 2016 at 06:29:35PM +0530, Ravi Bangoria wrote:
> Do not ignore a call instruction with an indirect target when it's already
> identified as a call. This is an extension of commit e8ea1561952b
> ("perf annotate: Use raw form for register indirect call instructions")
> to generalize annotation for all instructions with indirect calls.
> 
> This is needed for certain powerpc call instructions that use address
> in a register (such as bctrl, btarl, ...).
> 
> Apart from that, when kcore is used to disassemble function, all call
> instructions were ignored. This patch will fix it as a side effect by
> not ignoring them. For example,
> 
> Before (with kcore):
>mov%r13,%rdi
>callq  0x811a7e70
>  ^ jmpq   64
>mov%gs:0x7ef41a6e(%rip),%al
> 
> After (with kcore):
>mov%r13,%rdi
>  > callq  0x811a7e70
>  ^ jmpq   64
>mov%gs:0x7ef41a6e(%rip),%al

Ok, makes sense, but now I have the -> and can't press enter to go
to that function. In fact, for the case I'm using as a test, the
vsnprintf kernel function, I get:

   │ 56:   test   %al,%al   
 ▒
   │ ↓ je 81
 ▒
   │   lea-0x38(%rbp),%rsi  
 ▒
   │   mov%r15,%rdi 
 ▒
   │ → callq  0x993e3230 

That 0x993e3230 should've been resolved to:

[root@jouet ~]# grep 993e3230 /proc/kallsyms 
993e3230 t format_decode

Trying to investigate why it doesn't...

- Arnaldo

> Suggested-by: Michael Ellerman 
> [Suggested about 'bctrl' instruction]
> Signed-off-by: Ravi Bangoria 
> ---
> Changes in v6:
>   - No change
> 
>  tools/perf/util/annotate.c | 8 ++--
>  1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> index ea07588..a05423b 100644
> --- a/tools/perf/util/annotate.c
> +++ b/tools/perf/util/annotate.c
> @@ -81,16 +81,12 @@ static int call__parse(struct ins_operands *ops, const 
> char *norm_arch)
>   return ops->target.name == NULL ? -1 : 0;
>  
>  indirect_call:
> - tok = strchr(endptr, '(');
> - if (tok != NULL) {
> + tok = strchr(endptr, '*');
> + if (tok == NULL) {
>   ops->target.addr = 0;
>   return 0;
>   }
>  
> - tok = strchr(endptr, '*');
> - if (tok == NULL)
> - return -1;
> -
>   ops->target.addr = strtoull(tok + 1, NULL, 16);
>   return 0;
>  }
> -- 
> 2.5.5


Re: [PATCH] powerpc/pseries: fix memory leak in queue_hotplug_event() error path

2016-09-19 Thread Nathan Fontenot
On 09/19/2016 01:41 AM, Andrew Donnellan wrote:
> If we fail to allocate work, we don't end up using hp_errlog_copy. Free it
> in the error path.
> 
> Signed-off-by: Andrew Donnellan 

Reviewed-by: Nathan Fontenot 

> 
> ---
> 
> Found by Coverity Scan. Compile tested only.
> ---
>  arch/powerpc/platforms/pseries/dlpar.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index 4748124..6a99e72 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -413,6 +413,7 @@ void queue_hotplug_event(struct pseries_hp_errorlog 
> *hp_errlog,
>   queue_work(pseries_hp_wq, (struct work_struct *)work);
>   } else {
>   *rc = -ENOMEM;
> + kfree(hp_errlog_copy);
>   complete(hotplug_done);
>   }
>  }
> 



Re: [PATCH v6 3/7] perf annotate: Add support for powerpc

2016-09-19 Thread Arnaldo Carvalho de Melo
On Fri, Aug 19, 2016 at 06:29:34PM +0530, Ravi Bangoria wrote:
> From: "Naveen N. Rao" 
> 
> +static struct ins *ins__find_powerpc(const char *name)
> +{
> + int i;
> + struct ins *ins;
> + struct ins_ops *ops;
> + static struct instructions_powerpc head;
> + static bool list_initialized;
> +
> + /*
> +  * - Interested only if instruction starts with 'b'.
> +  * - Few start with 'b', but aren't branch instructions.
> +  */
> + if (name[0] != 'b' ||
> + !strncmp(name, "bcd", 3)   ||
> + !strncmp(name, "brinc", 5) ||
> + !strncmp(name, "bper", 4))
> + return NULL;
> +
> + if (!list_initialized) {
> + INIT_LIST_HEAD(&head.list);
> + list_initialized = true;
> + }

Why not ditch list_initialized and instead just declare the list as:

static struct instructions_powerpc head = {
.list = LIST_HEAD_INIT(head.list),
};

Just like the kernel sources do? See for instance:

  net/core/net_namespace.c

struct net init_net = {
.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
};

> +
> + /*
> +  * Return if we already have object of 'struct ins' for this instruction
> +  */
> + ins = list_search__ins_powerpc(&head, name);
> + if (ins)
> + return ins;
> +
> + ops = &jump_ops;
> +
> + i = strlen(name) - 1;
> + if (i < 0)
> + return NULL;
> +
> + /* ignore optional hints at the end of the instructions */
> + if (name[i] == '+' || name[i] == '-')
> + i--;
> +
> + if (name[i] == 'l' || (name[i] == 'a' && name[i-1] == 'l')) {
> + /*
> +  * if the instruction ends up with 'l' or 'la', then
> +  * those are considered 'calls' since they update LR.
> +  * ... except for 'bnl' which is branch if not less than
> +  * and the absolute form of the same.
> +  */
> + if (strcmp(name, "bnl") && strcmp(name, "bnl+") &&
> + strcmp(name, "bnl-") && strcmp(name, "bnla") &&
> + strcmp(name, "bnla+") && strcmp(name, "bnla-"))
> + ops = &call_ops;
> + }
> + if (name[i] == 'r' && name[i-1] == 'l')
> + /*
> +  * instructions ending with 'lr' are considered to be
> +  * return instructions
> +  */
> + ops = &ret_ops;
> +
> + /*
> +  * Add instruction to list so next time no need to
> +  * allocate memory for it.
> +  */
> + return list_add__ins_powerpc(&head, name, ops);
> +}
> +
>  static void ins__sort(struct ins *instructions, int nmemb)
>  {
>   qsort(instructions, nmemb, sizeof(struct ins), ins__cmp);
> @@ -585,6 +699,8 @@ static struct ins *ins__find(const char *name, const char 
> *norm_arch)
>   } else if (!strcmp(norm_arch, NORM_ARM)) {
>   instructions = instructions_arm;
>   nmemb = ARRAY_SIZE(instructions_arm);
> + } else if (!strcmp(norm_arch, NORM_POWERPC)) {
> + return ins__find_powerpc(name);
>   } else {
>   pr_err("perf annotate not supported by %s arch\n", norm_arch);
>   return NULL;
> -- 
> 2.5.5


Re: PowerPC agpmode issues

2016-09-19 Thread Mathieu Malaterre
Hi,

On Mon, Sep 19, 2016 at 11:13 AM, Herminio Hernandez, Jr.
 wrote:
> Apologies for the previous email.
>
> What I was trying to say was that I just recently installed Debian on my PowerMac
> G5. I upgraded to sid and installed the non-free firmware. After reboot I
> noticed that GPU acceleration was broken and I received the following error:
>
> [   10.428542] [drm] Loading R300 Microcode
> [   10.486631] radeon :f0:10.0: firmware: direct-loading firmware
> radeon/R300_cp.bin
> [   10.486865] [drm] radeon: ring at 0x0001
> [   10.634417] [drm:.r100_ring_test [radeon]] *ERROR* radeon: ring test
> failed (scratch(0x15E4)=0xCAFEDEAD)
> [   10.634526] [drm:.r100_cp_init [radeon]] *ERROR* radeon: cp isn't working
[...]
> [   11.935061] [drm] Forcing AGP to PCI mode

Well that is really odd. I am trying hard to push my pci patch upstream:

https://patchwork.kernel.org/patch/9088181/

I do believe PCI mode always works, so I would be interested if you
could report the bug upstream to have PCI mode work on all PowerMacs
(AGP is known to be broken).

Pay attention to your /etc/modules file: check whether or not it
contains an option for the loaded modules, e.g.:

radeon.agpmode=-1

Finally your dmesg looks odd since the line `[drm] Forcing AGP to PCI
mode` comes only after the first error.

-M


Re: [PATCH v2 2/3] powerpc/mm: allow memory hotplug into a memoryless node

2016-09-19 Thread Balbir Singh


On 15/09/16 06:06, Reza Arbab wrote:
> Remove the check which prevents us from hotplugging into an empty node.
> 
> This limitation has been questioned before [1], and judging by the
> response, there doesn't seem to be a reason we can't remove it. No issues
> have been found in light testing.
> 
> [1] 
> http://lkml.kernel.org/r/cagzkibrmksa1yyhbf5hwgxubcjse5smksmy4tpanerme2ug...@mail.gmail.com
> 
> http://lkml.kernel.org/r/20160511215051.gf22...@arbab-laptop.austin.ibm.com
> 
> Signed-off-by: Reza Arbab 
> Acked-by: Balbir Singh 
> Cc: Nathan Fontenot 
> Cc: Bharata B Rao 
> ---
>  arch/powerpc/mm/numa.c | 13 +
>  1 file changed, 1 insertion(+), 12 deletions(-)
> 

I presume you've tested with CONFIG_NODES_SHIFT of 8 (255 nodes?)

Balbir Singh.


Re: [PATCH v2] powerpc: fix usage of _PAGE_RO in hugepage

2016-09-19 Thread Christophe Leroy



On 19/09/2016 at 12:58, Christophe Leroy wrote:

On some CPUs like the 8xx, _PAGE_RW, and hence _PAGE_WRITE, is defined
as 0, and _PAGE_RO has to be set when a page is not writable.

_PAGE_RO is defined by default in pte-common.h; however, BOOK3S/64
doesn't include that file, so _PAGE_RO has to be defined explicitly
in book3s/64/pgtable.h.

Fixes: a7b9f671f2d14 ("powerpc32: adds handling of _PAGE_RO")
Signed-off-by: Christophe Leroy 


Forgot to add from V1:

Reviewed-by: Aneesh Kumar K.V 

Christophe



---
v1: This patch was initially part of the v1 series of patches for providing
   hugepage support to the 8xx. As suggested by Aneesh, that series has
   been split to focus only on the hugepage implementation for the 8xx.
   This patch is a fix and is independent of the 8xx hugepage implementation,
   although it is required to have hugepage support working properly on
   the 8xx.

v2: Added a comment in the code as suggested by Aneesh.

 arch/powerpc/include/asm/book3s/64/pgtable.h | 2 ++
 arch/powerpc/mm/hugetlbpage.c| 7 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8ec8be9..9fd77f8 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -6,6 +6,8 @@
  */
 #define _PAGE_BIT_SWAP_TYPE0

+#define _PAGE_RO   0
+
 #define _PAGE_EXEC 0x1 /* execute permission */
 #define _PAGE_WRITE0x2 /* write access allowed */
 #define _PAGE_READ 0x4 /* read access allowed */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7372ee1..a5d3ecd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1019,8 +1019,15 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned 
long addr,

pte = READ_ONCE(*ptep);
mask = _PAGE_PRESENT | _PAGE_READ;
+
+   /*
+* On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined
+* as 0 and _PAGE_RO has to be set when a page is not writable
+*/
if (write)
mask |= _PAGE_WRITE;
+   else
+   mask |= _PAGE_RO;

if ((pte_val(pte) & mask) != mask)
return 0;



[PATCH v2] powerpc: fix usage of _PAGE_RO in hugepage

2016-09-19 Thread Christophe Leroy
On some CPUs like the 8xx, _PAGE_RW, and hence _PAGE_WRITE, is defined
as 0, and _PAGE_RO has to be set when a page is not writable.

_PAGE_RO is defined by default in pte-common.h; however, BOOK3S/64
doesn't include that file, so _PAGE_RO has to be defined explicitly
in book3s/64/pgtable.h.

Fixes: a7b9f671f2d14 ("powerpc32: adds handling of _PAGE_RO")
Signed-off-by: Christophe Leroy 
---
v1: This patch was initially part of the v1 series of patches for providing
   hugepage support to the 8xx. As suggested by Aneesh, that series has
   been split to focus only on the hugepage implementation for the 8xx.
   This patch is a fix and is independent of the 8xx hugepage implementation,
   although it is required to have hugepage support working properly on
   the 8xx.

v2: Added a comment in the code as suggested by Aneesh.

 arch/powerpc/include/asm/book3s/64/pgtable.h | 2 ++
 arch/powerpc/mm/hugetlbpage.c| 7 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8ec8be9..9fd77f8 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -6,6 +6,8 @@
  */
 #define _PAGE_BIT_SWAP_TYPE0
 
+#define _PAGE_RO   0
+
 #define _PAGE_EXEC 0x1 /* execute permission */
 #define _PAGE_WRITE0x2 /* write access allowed */
 #define _PAGE_READ 0x4 /* read access allowed */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7372ee1..a5d3ecd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1019,8 +1019,15 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned 
long addr,
 
pte = READ_ONCE(*ptep);
mask = _PAGE_PRESENT | _PAGE_READ;
+
+   /*
+* On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined
+* as 0 and _PAGE_RO has to be set when a page is not writable
+*/
if (write)
mask |= _PAGE_WRITE;
+   else
+   mask |= _PAGE_RO;
 
if ((pte_val(pte) & mask) != mask)
return 0;
-- 
2.1.0



Re: [PATCH] powernv/pci: Fix m64 checks for SR-IOV and window alignment

2016-09-19 Thread Benjamin Herrenschmidt
On Mon, 2016-09-19 at 16:37 +1000, Russell Currey wrote:
> On Wed, 2016-09-14 at 21:30 +1000, Gavin Shan wrote:
> > 
> > On Wed, Sep 14, 2016 at 05:51:08PM +1000, Benjamin Herrenschmidt wrote:
> > > 
> > > 
> > > On Wed, 2016-09-14 at 16:37 +1000, Russell Currey wrote:
> > > > 
> > > > 
> > > > Commit 5958d19a143e checks for prefetchable m64 BARs by comparing the
> > > > addresses instead of using resource flags.  This broke SR-IOV as the
> > > > m64
> > > > check in pnv_pci_ioda_fixup_iov_resources() fails.
> > > > 
> > > > The condition in pnv_pci_window_alignment() also changed to checking
> > > > only IORESOURCE_MEM_64 instead of both IORESOURCE_MEM_64 and
> > > > IORESOURCE_PREFETCH.
> > > 
> > > CC'ing Gavin who might have some insight in the matter.
> > > 
> > > Why do we check for prefetch ? On PCIe, any 64-bit BAR can live under a
> > > prefetchable region afaik... Gavin, any idea ?
> > > 
> > Ben, what I have understood for a long time: a non-prefetchable BAR cannot live
> > under a prefetchable region (window), but any BAR can live under a
> > non-prefetchable region (window).

That is actually no longer true on PCIe I think. I need to double check but I
believe PCIe allows it because PCIe bridges aren't allowed to prefetch. 

That being said, our alignment hook is for bridge regions, and in that case, 
well,
the only 64-bit window is prefetchable...

> > > 
> > > 
> > > 
> > > > 
> > > > 
> > > > Revert these cases to the previous behaviour, adding a new helper
> > > > function
> > > > to do so.  This is named pnv_pci_is_m64_flags() to make it clear this
> > > > function is only looking at resource flags and should not be relied
> > > > on for
> > > > non-SRIOV resources.
> > > > 
> > > > Fixes: 5958d19a143e ("Fix incorrect PE reservation attempt on some
> > > > 64-bit BARs")
> > > > > > > > Reported-by: Alexey Kardashevskiy 
> > > > > > > > Signed-off-by: Russell Currey 
> > > > ---
> > > >  arch/powerpc/platforms/powernv/pci-ioda.c | 11 +--
> > > >  1 file changed, 9 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> > > > b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > > index c16d790..2f25622 100644
> > > > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > > > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > > @@ -124,6 +124,13 @@ static inline bool pnv_pci_is_m64(struct pnv_phb
> > > > *phb, struct resource *r)
> > > > > > > >     r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
> > > >  }
> > > >  
> > > > +static inline bool pnv_pci_is_m64_flags(unsigned long
> > > > resource_flags)
> > > > +{
> > > > > > > > +   unsigned long flags = (IORESOURCE_MEM_64 |
> > > > IORESOURCE_PREFETCH);
> > > > +
> > > > > > > > +   return (resource_flags & flags) == flags;
> > > > +}
> > > > 
> > > I don't agree. See below.
> > > 
> > > > 
> > > > 
> > > >  static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int
> > > > pe_no)
> > > >  {
> > > > > > > >     phb->ioda.pe_array[pe_no].phb = phb;
> > > > @@ -2871,7 +2878,7 @@ static void
> > > > pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> > > > > > > >     res = &pdev->resource[i + PCI_IOV_RESOURCES];
> > > > > > > >     if (!res->flags || res->parent)
> > > > > > > >     continue;
> > > > > > > > -   if (!pnv_pci_is_m64(phb, res)) {
> > > > > > > > +   if (!pnv_pci_is_m64_flags(res->flags)) {
> > > > > > > >     dev_warn(&pdev->dev, "Don't support SR-IOV with"
> > > > > > > >     " non M64 VF BAR%d: %pR. \n", i, res);
> > > 
> > > What is that function actually doing ? Having IORESOURCE_64 and
> > > PREFETCHABLE is completely orthogonal to being in the M64 region. This
> > > is the bug my original patch was fixing in fact as it's possible for
> > > the allocator to put a 64-bit resource in the M32 region.
> > > 
> > 
> > This function is called before the resources are resized and assigned.
> > So using the resource's start/end addresses to judge whether it's in the M64 or
> > M32 window is not reliable. Currently, all IOV BARs are required to have
> > (IORESOURCE_64 | PREFETCHABLE), which is covered by the bridge's M64 window
> > and the PHB's M64 windows (BARs).
> > 
> > > 
> > > 
> > > > 
> > > > 
> > > > @@ -3096,7 +3103,7 @@ static resource_size_t
> > > > pnv_pci_window_alignment(struct pci_bus *bus,
> > > > > > > >      * alignment for any 64-bit resource, PCIe doesn't care 
> > > > > > > > and
> > > > > > > >      * bridges only do 64-bit prefetchable anyway.
> > > > > > > >      */
> > > > > > > > -   if (phb->ioda.m64_segsize && (type & IORESOURCE_MEM_64))
> > > > > > > > +   if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
> > > > > > > >     return 

Re: PowerPC agpmode issues

2016-09-19 Thread Benjamin Herrenschmidt
On Mon, 2016-09-19 at 02:13 -0700, Herminio Hernandez, Jr. wrote:
> 
> I noticed the GPU was set to PCI mode; however, I did not set
> radeon.agpmode=-1 in yaboot. I then installed Lubuntu 16.10 alongside
> Debian. I noticed that the GPU was working and it was not set to
> force PCI mode. Has there been a change to the kernel in Debian to force
> this by default? If so, it looks like it is breaking on certain G5
> machines. I am attaching both dmesg files for review.
> 
> Thanks and sorry for the previous garbled email.

Hrm... this is odd. I would expect PCI mode to work and AGP to be
busted to be honest. Can you provide the complete log ? I wonder
if it's trying to enable the PCIe bypass high aperture and hits the
Radeon address limit...

Ben.



Re: [PATCH v2 1/3] drivers/of: recognize status property of dt memory nodes

2016-09-19 Thread Balbir Singh


On 15/09/16 06:06, Reza Arbab wrote:
> Respect the standard dt "status" property when scanning memory nodes in
> early_init_dt_scan_memory(), so that if the property is present and not
> "okay", no memory will be added.
> 
> The use case at hand is accelerator or device memory, which may be
> unusable until post-boot initialization of the memory link. Such a node
> can be described in the dt as any other, given its status is "disabled".
> Per the device tree specification,
> 
> "disabled"
>   Indicates that the device is not presently operational, but it
>   might become operational in the future (for example, something
>   is not plugged in, or switched off).
> 
> Once such memory is made operational, it can then be hotplugged.
> 
> Signed-off-by: Reza Arbab 

Makes sense. So basically a /memory@ node with a missing status or status = "okay"
is added, and others are skipped. No memblocks corresponding to those nodes
are created either.
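Roughly, the scan-time check boils down to something like this sketch (the
exact helper and string matching used in the actual patch may differ):

  /* In early_init_dt_scan_memory(): skip memory nodes whose standard
   * "status" property says they are not presently operational. */
  const char *status = of_get_flat_dt_prop(node, "status", NULL);

  if (status && strcmp(status, "okay") != 0 && strcmp(status, "ok") != 0)
          return 0;    /* e.g. status = "disabled": don't add this memory now */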

Balbir Singh


Re: [PATCH v7 6/6] powerpc: pSeries: Add pv-qspinlock build config/make

2016-09-19 Thread kbuild test robot
Hi Pan,

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.8-rc7 next-20160916]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]
[Suggest to use git(>=2.9.0) format-patch --base= (or --base=auto for 
convenience) to record what (public, well-known) commit your patch series was 
built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:
https://github.com/0day-ci/linux/commits/Pan-Xinhui/Implement-qspinlock-pv-qspinlock-on-ppc/20160919-133130
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   arch/powerpc/include/asm/cmpxchg.h:326:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg");
 ^~~~
   In function '__cmpxchg',
   inlined from 'pv_wait_node' at kernel/locking/qspinlock_paravirt.h:328:3,
   inlined from '__pv_queued_spin_lock_slowpath' at 
kernel/locking/qspinlock.c:538:3:
   include/linux/compiler.h:491:38: error: call to '__compiletime_assert_326' 
declared with attribute error: Unsupported size for __cmpxchg
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^~
   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   arch/powerpc/include/asm/cmpxchg.h:326:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg");
 ^~~~
   In function '__cmpxchg',
   inlined from 'pv_wait_head_or_lock' at 
kernel/locking/qspinlock_paravirt.h:109:10,
   inlined from '__pv_queued_spin_lock_slowpath' at 
kernel/locking/qspinlock.c:573:5:
   include/linux/compiler.h:491:38: error: call to '__compiletime_assert_326' 
declared with attribute error: Unsupported size for __cmpxchg
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^~
   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   arch/powerpc/include/asm/cmpxchg.h:326:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg");
 ^~~~
   In function '__xchg_relaxed',
   inlined from 'pv_wait_head_or_lock' at 
kernel/locking/qspinlock_paravirt.h:442:8,
   inlined from '__pv_queued_spin_lock_slowpath' at 
kernel/locking/qspinlock.c:573:5:
   include/linux/compiler.h:491:38: error: call to '__compiletime_assert_113' 
declared with attribute error: Unsupported size for __xchg_local
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^~
   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   a

Re: [PATCH][RFC] Implement arch primitives for busywait loops

2016-09-19 Thread Nicholas Piggin
On Mon, 19 Sep 2016 17:45:52 +1000
Balbir Singh  wrote:

> On 16/09/16 18:57, Nicholas Piggin wrote:
> > Implementing busy wait loops with cpu_relax() in callers poses
> > some difficulties for powerpc.
> > 
> > First, we want to put our SMT thread into a low priority mode for the
> > duration of the loop, but then return to normal priority after exiting
> > the loop.  Dependong on the CPU design, 'HMT_low() ; HMT_medium();' as
> > cpu_relax() does may have HMT_medium take effect before HMT_low made
> > any (or much) difference.
> > 
> > Second, it can be beneficial for some implementations to spin on the
> > exit condition with a statically predicted-not-taken branch (i.e.,
> > always predict the loop will exit).
> >   
> 
> IIUC, what you are proposing is that cpu_relax() be split such 
> that on entry we do HMT_low() and on exit do HMT_medium(). I think
> that makes a lot of sense, in that it allows the required transition
> time from low to medium.

Basically yes, although also allowing the loop exit branch to be
overridden by the arch code too. That can possibly benefit some
microarchitectures -- e.g., you want loop exit to not take a branch
miss if possible but it may be acceptable to branch miss for every
other iteration. I'm doing some testing of it now (previous patch
was garbage btw, don't try to use it!)


> > diff --git a/arch/powerpc/include/asm/processor.h 
> > b/arch/powerpc/include/asm/processor.h
> > index 68e3bf5..e10aee2 100644
> > --- a/arch/powerpc/include/asm/processor.h
> > +++ b/arch/powerpc/include/asm/processor.h
> > @@ -402,6 +402,28 @@ static inline unsigned long __pack_fe01(unsigned int 
> > fpmode)
> >  
> >  #ifdef CONFIG_PPC64
> >  #define cpu_relax()do { HMT_low(); HMT_medium(); barrier(); } 
> > while (0)
> > +
> > +#define spin_do\  
> 
> How about cpu_relax_begin()?
> 
> > +do {   \
> > +   HMT_low();  \
> > +   __asm__ __volatile__ (  "1010:");
> > +
> > +#define spin_while(cond)   \  
> 
> cpu_relax_while()
> > +   barrier();  \
> > +   __asm__ __volatile__ (  "cmpdi  %0,0\n\t"   \
> > +   "beq-   1010b   \n\t"   \
> > +   : : "r" (cond));\
> > +   HMT_medium();   \
> > +} while (0)
> > +  
> 
> 
> > +#define spin_until(cond)   \  
> 
> This is just spin_while(!cond) from an implementation perspective right?

Yes, the only reason I put it in was because such spin loops often
read a bit better the other way from normal loops (you are interested
in the exit condition rather than the loop-again condition).

> 
> cpu_relax_until()
> 
> > +   barrier();  \
> > +   __asm__ __volatile__ (  "cmpdi  %0,0\n\t"   \
> > +   "bne-   1010b   \n\t"   \
> > +   : : "r" (cond));\
> > +   HMT_medium();   \
> > +} while (0)
> > +  
> 
> Then add cpu_relax_end() that does HMT_medium()

Hmm. I see what you mean, but I don't know if we should trust open
coded callers to get this right. It could cause weird problems and
isn't easily caught. If we can get something that breaks build,
perhaps. OTOH, I prefer all the logic including SMT priority to be
in the spin loop primitive directly because it's pretty subtle and
might need to be runtime patched.

We could *also* have a cpu_relax_begin/cpu_relax_end pair, although
I'd like to first see callers that don't suit spin_ primitives and
see what should be done.
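For illustration, a caller-side conversion might look like the sketch below
(macro names and shapes as in this RFC, which has already been flagged as
needing rework, so treat it as intent rather than a final API):

  static void wait_for_flag(int *flag)
  {
          int val;

          /* Today, open-coded:
           *      while (!READ_ONCE(*flag))
           *              cpu_relax();
           */

          /* With the proposed primitives: HMT_low() once on entry,
           * HMT_medium() only after the exit condition is met, and the
           * loop branch laid out to predict "exit" by default. */
          spin_do {
                  val = READ_ONCE(*flag);
          } spin_until(val);
  }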

Thanks,
Nick


Re: [PATCH v7 3/6] powerpc: pseries/Kconfig: Add qspinlock build config

2016-09-19 Thread kbuild test robot
Hi Pan,

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.8-rc7 next-20160916]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]
[Suggest to use git(>=2.9.0) format-patch --base= (or --base=auto for 
convenience) to record what (public, well-known) commit your patch series was 
built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:
https://github.com/0day-ci/linux/commits/Pan-Xinhui/Implement-qspinlock-pv-qspinlock-on-ppc/20160919-133130
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   In file included from include/uapi/linux/stddef.h:1:0,
from include/linux/stddef.h:4,
from include/uapi/linux/posix_types.h:4,
from include/uapi/linux/types.h:13,
from include/linux/types.h:5,
from include/linux/smp.h:10,
from kernel/locking/qspinlock.c:25:
   In function '__xchg_relaxed',
   inlined from 'queued_spin_lock_slowpath' at 
kernel/locking/qspinlock.c:184:14:
>> include/linux/compiler.h:491:38: error: call to '__compiletime_assert_113' 
>> declared with attribute error: Unsupported size for __xchg_local
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^~
   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   arch/powerpc/include/asm/cmpxchg.h:113:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg_local");
 ^~~~

vim +/__compiletime_assert_113 +491 include/linux/compiler.h

9a8ab1c3 Daniel Santos  2013-02-21  475 
__compiletime_error_fallback(__cond);   \
9a8ab1c3 Daniel Santos  2013-02-21  476 } while (0)
9a8ab1c3 Daniel Santos  2013-02-21  477  
9a8ab1c3 Daniel Santos  2013-02-21  478  #define _compiletime_assert(condition, 
msg, prefix, suffix) \
9a8ab1c3 Daniel Santos  2013-02-21  479 __compiletime_assert(condition, 
msg, prefix, suffix)
9a8ab1c3 Daniel Santos  2013-02-21  480  
9a8ab1c3 Daniel Santos  2013-02-21  481  /**
9a8ab1c3 Daniel Santos  2013-02-21  482   * compiletime_assert - break build 
and emit msg if condition is false
9a8ab1c3 Daniel Santos  2013-02-21  483   * @condition: a compile-time constant 
condition to check
9a8ab1c3 Daniel Santos  2013-02-21  484   * @msg:   a message to emit if 
condition is false
9a8ab1c3 Daniel Santos  2013-02-21  485   *
9a8ab1c3 Daniel Santos  2013-02-21  486   * In tradition of POSIX assert, this 
macro will break the build if the
9a8ab1c3 Daniel Santos  2013-02-21  487   * supplied condition is *false*, 
emitting the supplied error message if the
9a8ab1c3 Daniel Santos  2013-02-21  488   * compiler has support to do so.
9a8ab1c3 Daniel Santos  2013-02-21  489   */
9a8ab1c3 Daniel Santos  2013-02-21  490  #define compiletime_assert(condition, 
msg) \
9a8ab1c3 Daniel Santos  2013-02-21 @491 _compiletime_assert(condition, 
msg, __compiletime_assert_, __LINE__)
9a8ab1c3 Daniel Santos  2013-02-21  492  
47933ad4 Peter Zijlstra 2013-11-06  493  #define 
compiletime_assert_atomic_type(t)  \
47933ad4 Peter Zijlstra 2013-11-06  494 
compiletime_assert(__native_word(t),\
47933ad4 Peter Zijlstra 2013-11-06  495 "Need native word sized 
stores/loads for atomicity.")
47933ad4 Peter Zijlstra 2013-11-06  496  
9c3cdc1f Linus Torvalds 2008-05-10  497  /*
9c3cdc1f Linus Torvalds 2008-05-10  498   * Prevent the compiler from merging 
or refetching accesses.  The compiler
9c3cdc1f Linus Torvalds 2008-05-10  499   * is also forbidden from reordering 
successive instances of ACCESS_ONCE(),

:: The code at line 491 was first introduced by commit
:: 9a8ab1c39970a4938a72d94e6fd13be88a797590 bug.h, compiler.h: introduce 
compiletime_assert & BUILD_BUG_ON_MSG

:: TO: Daniel Santos <daniel.san...@pobox.com>
:: CC: Linus 

Re: [PATCH][RFC] Implement arch primitives for busywait loops

2016-09-19 Thread Balbir Singh


On 16/09/16 18:57, Nicholas Piggin wrote:
> Implementing busy wait loops with cpu_relax() in callers poses
> some difficulties for powerpc.
> 
> First, we want to put our SMT thread into a low priority mode for the
> duration of the loop, but then return to normal priority after exiting
> the loop.  Depending on the CPU design, 'HMT_low() ; HMT_medium();' as
> cpu_relax() does may have HMT_medium take effect before HMT_low made
> any (or much) difference.
> 
> Second, it can be beneficial for some implementations to spin on the
> exit condition with a statically predicted-not-taken branch (i.e.,
> always predict the loop will exit).
> 

IIUC, what you are proposing is that cpu_relax() be split so that we
do HMT_low() on entry to the wait loop and HMT_medium() only on exit.
I think that makes a lot of sense, in that it gives HMT_low() time to
actually take effect before the priority is raised again (see the
sketch below).
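
A minimal sketch of the difference (hypothetical caller, not code from
the patch):

/* Today: each iteration drops and immediately restores SMT priority,
 * so HMT_low() may never get a chance to take effect. */
static void wait_on_flag_today(int *flag)
{
        while (!READ_ONCE(*flag))
                cpu_relax();    /* HMT_low(); HMT_medium(); barrier(); */
}

/* Proposed shape: stay at low priority for the whole wait and raise it
 * once, on exit. */
static void wait_on_flag_split(int *flag)
{
        HMT_low();
        while (!READ_ONCE(*flag))
                barrier();
        HMT_medium();
}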

> This is a quick RFC with a couple of users converted to see what
> people think. I don't use a C branch with hints, because we don't want
> the compiler moving the loop body out of line, which makes it a bit
> messy unfortunately. If there's a better way to do it, I'm all ears.
> 
> I would not propose to switch all callers immediately, just some
> core synchronisation primitives.
> 
> ---
>  arch/powerpc/include/asm/processor.h | 22 ++
>  include/asm-generic/barrier.h|  7 ++-
>  include/linux/bit_spinlock.h |  5 ++---
>  include/linux/cgroup.h   |  7 ++-
>  include/linux/seqlock.h  | 10 --
>  5 files changed, 32 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/processor.h 
> b/arch/powerpc/include/asm/processor.h
> index 68e3bf5..e10aee2 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -402,6 +402,28 @@ static inline unsigned long __pack_fe01(unsigned int 
> fpmode)
>  
>  #ifdef CONFIG_PPC64
>  #define cpu_relax()  do { HMT_low(); HMT_medium(); barrier(); } while (0)
> +
> +#define spin_do  \

How about cpu_relax_begin()?

> +do { \
> + HMT_low();  \
> + __asm__ __volatile__ (  "1010:");
> +
> +#define spin_while(cond) \

cpu_relax_while()
> + barrier();  \
> + __asm__ __volatile__ (  "cmpdi  %0,0\n\t"   \
> + "beq-   1010b   \n\t"   \
> + : : "r" (cond));\
> + HMT_medium();   \
> +} while (0)
> +


> +#define spin_until(cond) \

This is just spin_while(!cond) from an implementation perspective right?

cpu_relax_until()

> + barrier();  \
> + __asm__ __volatile__ (  "cmpdi  %0,0\n\t"   \
> + "bne-   1010b   \n\t"   \
> + : : "r" (cond));\
> + HMT_medium();   \
> +} while (0)
> +

Then add cpu_relax_end() that does HMT_medium()

Balbir Singh.


Re: [V2] powerpc/Kconfig: Update config option based on page size.

2016-09-19 Thread Balbir Singh


On 17/09/16 01:16, Aneesh Kumar K.V wrote:
> - range 9 13 if PPC64 && !PPC_64K_PAGES
> + range 13 13 if PPC64 && !PPC_64K_PAGES
>   default "13" if PPC64 && !PPC_64K_PAGES

Do we still want it to be 13 13, or 12 13?
Looks like the lower side of the range can cause issues depending on
the huge page size (see the arithmetic below).
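
The arithmetic in question, assuming 4K base pages and 16M hugepages (a
userspace back-of-the-envelope check, not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned long base = 4096UL;            /* 4K base page */
        unsigned long huge = 16UL << 20;        /* 16M huge page */
        int order = 0;

        while ((base << order) < huge)
                order++;                        /* ends at order 12 */

        /* The buddy allocator only serves blocks up to MAX_ORDER - 1,
         * so FORCE_MAX_ZONEORDER must be at least order + 1. */
        printf("16M hugepage is order %d, needs MAX_ORDER >= %d\n",
               order, order + 1);
        return 0;
}

By that arithmetic, a lower bound of 12 would cap buddy allocations at
order 11 (8M), which may be why the patch pins the range at 13 -- but
that is exactly the question being asked here.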


Balbir Singh.


Re: [PATCH] perf tests: Add dwarf unwind test for powerpc

2016-09-19 Thread Jiri Olsa
On Mon, Sep 19, 2016 at 02:38:20AM -0400, Ravi Bangoria wrote:
> The user stack dump feature was recently added for powerpc, but there
> was no test case available to exercise it. This test works the same way
> as on other arches: it prepares a stack frame on the perf test thread
> and compares each frame by unwinding it.
> 
>   $ ./perf test 50
> 50: Test dwarf unwind: Ok
> 
> User stack dump for powerpc: https://lkml.org/lkml/2016/4/28/482
> 
> Signed-off-by: Ravi Bangoria 

can't test, but looks good

Acked-by: Jiri Olsa 

thanks,
jirka


[PATCH] powerpc/pseries: fix memory leak in queue_hotplug_event() error path

2016-09-19 Thread Andrew Donnellan
If we fail to allocate work, we don't end up using hp_errlog_copy. Free it
in the error path.

Signed-off-by: Andrew Donnellan 

---

Found by Coverity Scan. Compile tested only.
---
 arch/powerpc/platforms/pseries/dlpar.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
b/arch/powerpc/platforms/pseries/dlpar.c
index 4748124..6a99e72 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -413,6 +413,7 @@ void queue_hotplug_event(struct pseries_hp_errorlog 
*hp_errlog,
queue_work(pseries_hp_wq, (struct work_struct *)work);
} else {
*rc = -ENOMEM;
+   kfree(hp_errlog_copy);
complete(hotplug_done);
}
 }
-- 
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



[PATCH] perf tests: Add dwarf unwind test for powerpc

2016-09-19 Thread Ravi Bangoria
The user stack dump feature was recently added for powerpc, but there
was no test case available to exercise it. This test works the same way
as on other arches: it prepares a stack frame on the perf test thread
and compares each frame by unwinding it.

  $ ./perf test 50
50: Test dwarf unwind: Ok

User stack dump for powerpc: https://lkml.org/lkml/2016/4/28/482

Signed-off-by: Ravi Bangoria 
---
 tools/perf/arch/powerpc/Build|  1 +
 tools/perf/arch/powerpc/include/arch-tests.h | 13 
 tools/perf/arch/powerpc/include/perf_regs.h  |  2 +
 tools/perf/arch/powerpc/tests/Build  |  4 ++
 tools/perf/arch/powerpc/tests/arch-tests.c   | 15 +
 tools/perf/arch/powerpc/tests/dwarf-unwind.c | 62 ++
 tools/perf/arch/powerpc/tests/regs_load.S| 94 
 tools/perf/tests/Build   |  2 +-
 tools/perf/tests/dwarf-unwind.c  |  2 +-
 9 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 tools/perf/arch/powerpc/include/arch-tests.h
 create mode 100644 tools/perf/arch/powerpc/tests/Build
 create mode 100644 tools/perf/arch/powerpc/tests/arch-tests.c
 create mode 100644 tools/perf/arch/powerpc/tests/dwarf-unwind.c
 create mode 100644 tools/perf/arch/powerpc/tests/regs_load.S

diff --git a/tools/perf/arch/powerpc/Build b/tools/perf/arch/powerpc/Build
index 54afe4a..db52fa2 100644
--- a/tools/perf/arch/powerpc/Build
+++ b/tools/perf/arch/powerpc/Build
@@ -1 +1,2 @@
 libperf-y += util/
+libperf-y += tests/
diff --git a/tools/perf/arch/powerpc/include/arch-tests.h 
b/tools/perf/arch/powerpc/include/arch-tests.h
new file mode 100644
index 000..84d8ded
--- /dev/null
+++ b/tools/perf/arch/powerpc/include/arch-tests.h
@@ -0,0 +1,13 @@
+#ifndef ARCH_TESTS_H
+#define ARCH_TESTS_H
+
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+struct thread;
+struct perf_sample;
+int test__arch_unwind_sample(struct perf_sample *sample,
+struct thread *thread);
+#endif
+
+extern struct test arch_tests[];
+
+#endif
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h 
b/tools/perf/arch/powerpc/include/perf_regs.h
index 75de0e9..c12f4e8 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -5,6 +5,8 @@
 #include 
 #include 
 
+void perf_regs_load(u64 *regs);
+
 #define PERF_REGS_MASK  ((1ULL << PERF_REG_POWERPC_MAX) - 1)
 #define PERF_REGS_MAX   PERF_REG_POWERPC_MAX
 #ifdef __powerpc64__
diff --git a/tools/perf/arch/powerpc/tests/Build 
b/tools/perf/arch/powerpc/tests/Build
new file mode 100644
index 000..d827ef3
--- /dev/null
+++ b/tools/perf/arch/powerpc/tests/Build
@@ -0,0 +1,4 @@
+libperf-$(CONFIG_DWARF_UNWIND) += regs_load.o
+libperf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
+
+libperf-y += arch-tests.o
diff --git a/tools/perf/arch/powerpc/tests/arch-tests.c 
b/tools/perf/arch/powerpc/tests/arch-tests.c
new file mode 100644
index 000..e24f462
--- /dev/null
+++ b/tools/perf/arch/powerpc/tests/arch-tests.c
@@ -0,0 +1,15 @@
+#include 
+#include "tests/tests.h"
+#include "arch-tests.h"
+
+struct test arch_tests[] = {
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+   {
+   .desc = "Test dwarf unwind",
+   .func = test__dwarf_unwind,
+   },
+#endif
+   {
+   .func = NULL,
+   },
+};
diff --git a/tools/perf/arch/powerpc/tests/dwarf-unwind.c 
b/tools/perf/arch/powerpc/tests/dwarf-unwind.c
new file mode 100644
index 000..0bac313
--- /dev/null
+++ b/tools/perf/arch/powerpc/tests/dwarf-unwind.c
@@ -0,0 +1,62 @@
+#include 
+#include "perf_regs.h"
+#include "thread.h"
+#include "map.h"
+#include "event.h"
+#include "debug.h"
+#include "tests/tests.h"
+#include "arch-tests.h"
+
+#define STACK_SIZE 8192
+
+static int sample_ustack(struct perf_sample *sample,
+struct thread *thread, u64 *regs)
+{
+   struct stack_dump *stack = &sample->user_stack;
+   struct map *map;
+   unsigned long sp;
+   u64 stack_size, *buf;
+
+   buf = malloc(STACK_SIZE);
+   if (!buf) {
+   pr_debug("failed to allocate sample uregs data\n");
+   return -1;
+   }
+
+   sp = (unsigned long) regs[PERF_REG_POWERPC_R1];
+
+   map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+   if (!map) {
+   pr_debug("failed to get stack map\n");
+   free(buf);
+   return -1;
+   }
+
+   stack_size = map->end - sp;
+   stack_size = stack_size > STACK_SIZE ? STACK_SIZE : stack_size;
+
+   memcpy(buf, (void *) sp, stack_size);
+   stack->data = (char *) buf;
+   stack->size = stack_size;
+   return 0;
+}
+
+int test__arch_unwind_sample(struct perf_sample *sample,
+struct thread *thread)
+{
+   struct regs_dump *regs = &sample->user_regs;
+   u64 *buf;
+
+   buf = calloc(1, sizeof(u64) * PERF_REGS_MAX);
+   if (!buf) {
+   pr_debug("failed to 

Re: [PATCH] powernv/pci: Fix m64 checks for SR-IOV and window alignment

2016-09-19 Thread Russell Currey
On Wed, 2016-09-14 at 21:30 +1000, Gavin Shan wrote:
> On Wed, Sep 14, 2016 at 05:51:08PM +1000, Benjamin Herrenschmidt wrote:
> > 
> > On Wed, 2016-09-14 at 16:37 +1000, Russell Currey wrote:
> > > 
> > > Commit 5958d19a143e checks for prefetchable m64 BARs by comparing the
> > > addresses instead of using resource flags.  This broke SR-IOV as the
> > > m64
> > > check in pnv_pci_ioda_fixup_iov_resources() fails.
> > > 
> > > The condition in pnv_pci_window_alignment() also changed to checking
> > > only IORESOURCE_MEM_64 instead of both IORESOURCE_MEM_64 and
> > > IORESOURCE_PREFETCH.
> > 
> > CC'ing Gavin who might have some insight in the matter.
> > 
> > Why do we check for prefetch ? On PCIe, any 64-bit BAR can live under a
> > prefetchable region afaik... Gavin, any idea ?
> > 
> 
> Ben, what I have understood for a long time: a non-prefetchable BAR cannot
> live under a prefetchable region (window), but any BAR can live under a
> non-prefetchable region (window).
> 
> > 
> > 
> > > 
> > > Revert these cases to the previous behaviour, adding a new helper
> > > function
> > > to do so.  This is named pnv_pci_is_m64_flags() to make it clear this
> > > function is only looking at resource flags and should not be relied
> > > on for
> > > non-SRIOV resources.
> > > 
> > > Fixes: 5958d19a143e ("Fix incorrect PE reservation attempt on some
> > > 64-bit BARs")
> > > Reported-by: Alexey Kardashevskiy 
> > > Signed-off-by: Russell Currey 
> > > ---
> > >  arch/powerpc/platforms/powernv/pci-ioda.c | 11 +--
> > >  1 file changed, 9 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> > > b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > index c16d790..2f25622 100644
> > > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > > @@ -124,6 +124,13 @@ static inline bool pnv_pci_is_m64(struct pnv_phb
> > > *phb, struct resource *r)
> > >   r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
> > >  }
> > >  
> > > +static inline bool pnv_pci_is_m64_flags(unsigned long
> > > resource_flags)
> > > +{
> > > + unsigned long flags = (IORESOURCE_MEM_64 |
> > > IORESOURCE_PREFETCH);
> > > +
> > > + return (resource_flags & flags) == flags;
> > > +}
> > > 
> > I don't agree. See below.
> > 
> > > 
> > >  static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int
> > > pe_no)
> > >  {
> > >   phb->ioda.pe_array[pe_no].phb = phb;
> > > @@ -2871,7 +2878,7 @@ static void
> > > pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> > >   res = &pdev->resource[i + PCI_IOV_RESOURCES];
> > >   if (!res->flags || res->parent)
> > >   continue;
> > > - if (!pnv_pci_is_m64(phb, res)) {
> > > + if (!pnv_pci_is_m64_flags(res->flags)) {
> > >   dev_warn(&pdev->dev, "Don't support SR-IOV with"
> > >   " non M64 VF BAR%d: %pR. \n",
> > >    i, res);
> > 
> > What is that function actually doing ? Having IORESOURCE_64 and
> > PREFETCHABLE is completely orthogonal to being in the M64 region. This
> > is the bug my original patch was fixing in fact as it's possible for
> > the allocator to put a 64-bit resource in the M32 region.
> > 
> 
> This function is called before the resources are resized and assigned.
> So using the resource's start/end addresses to judge whether it's in the
> M64 or M32 window is not reliable. Currently, all IOV BARs are required
> to have (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH), which is covered by
> the bridge's M64 window and the PHB's M64 windows (BARs).
> 
> > 
> > > 
> > > @@ -3096,7 +3103,7 @@ static resource_size_t
> > > pnv_pci_window_alignment(struct pci_bus *bus,
> > >    * alignment for any 64-bit resource, PCIe doesn't care and
> > >    * bridges only do 64-bit prefetchable anyway.
> > >    */
> > > - if (phb->ioda.m64_segsize && (type & IORESOURCE_MEM_64))
> > > + if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
> > >   return phb->ioda.m64_segsize;
> > 
> > I disagree similarly. 64-bit non-prefetchable resources should live in
> > the M64 space as well.
> > 
> 
> As I understand it, 64-bit non-prefetchable BARs cannot live behind
> M64 (64-bit prefetchable) windows.
> 
> > 
> > > 
> > >   if (type & IORESOURCE_MEM)
> > >   return phb->ioda.m32_segsize;
> > 
> > Something seems to be deeply wrong here, and this patch looks to me
> > like it's just papering over the problem in a way that could bring back
> > the bugs I've seen if the generic allocator decides to put things in
> > the M32 window.
> > 
> > We need to look at this more closely and understand WTF that code is
> > meant to do.
> > 
> 
> Yeah, it seems it partially reverts your changes. The start/end addresses
> are usable after resource resizing/assignment is finished. Before that,
> we still need to use the flags.

I agree with Ben that we need to look at 

Re: [PATCH v2 3/3] mm: enable CONFIG_MOVABLE_NODE on powerpc

2016-09-19 Thread Aneesh Kumar K.V
Reza Arbab  writes:

> Onlining memory into ZONE_MOVABLE requires CONFIG_MOVABLE_NODE. Enable
> the use of this config option on PPC64 platforms.
>
> Signed-off-by: Reza Arbab 
> ---
>  Documentation/kernel-parameters.txt | 2 +-
>  mm/Kconfig  | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index a4f4d69..3d8460d 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -2344,7 +2344,7 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   that the amount of memory usable for all allocations
>   is not too small.
>  
> - movable_node[KNL,X86] Boot-time switch to enable the effects
> + movable_node[KNL,X86,PPC] Boot-time switch to enable the effects
>   of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
>  

Movable node also does memblock_set_bottom_up(true). What is the impact
of that, and do we need an equivalent change? Also, where are we marking
the nodes which can be hotplugged, i.e. where do we call
memblock_mark_hotplug()? (See the sketch below.)
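
For comparison, a rough sketch of the two pieces being asked about, as
they are wired up on x86 today (the call sites, and the names prefixed
sketch_, are assumptions from memory rather than verbatim kernel code);
the open question is what the powerpc equivalents should be and where the
hotpluggable ranges would come from (e.g. the
ibm,dynamic-reconfiguration-memory device tree node):

/* 1. "movable_node" handling: allocate memblock memory bottom-up, near
 *    the kernel image, so early allocations stay out of the (typically
 *    higher) hotpluggable memory. */
static int __init sketch_parse_movable_node(char *p)
{
        memblock_set_bottom_up(true);
        return 0;
}
early_param("movable_node", sketch_parse_movable_node);

/* 2. Firmware affinity parsing (SRAT on x86): record which ranges are
 *    hotpluggable so zone/node setup can keep them in ZONE_MOVABLE. */
static void __init sketch_mark_hotplug(u64 start, u64 size, bool hotpluggable)
{
        if (hotpluggable && memblock_mark_hotplug(start, size))
                pr_warn("failed to mark hotpluggable range [%#llx-%#llx]\n",
                        start, start + size);
}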
   
>   MTD_Partition=  [MTD]
> diff --git a/mm/Kconfig b/mm/Kconfig
> index be0ee11..4b19cd3 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -153,7 +153,7 @@ config MOVABLE_NODE
>   bool "Enable to assign a node which has only movable memory"
>   depends on HAVE_MEMBLOCK
>   depends on NO_BOOTMEM
> - depends on X86_64
> + depends on X86_64 || PPC64
>   depends on NUMA
>   default n
>   help

-aneesh



Re: [PATCH 12/13] powerpc: Add a Kconfig and a functions to set new soft_enabled mask

2016-09-19 Thread Madhavan Srinivasan



On Friday 16 September 2016 04:26 PM, Nicholas Piggin wrote:

On Thu, 15 Sep 2016 18:32:02 +0530
Madhavan Srinivasan  wrote:



diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 9e5e9a6d4147..ae31b1e85fdb 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -209,6 +209,10 @@ notrace void arch_local_irq_restore(unsigned long en)
unsigned char irq_happened;
unsigned int replay;
  
+#ifdef CONFIG_IRQ_DEBUG_SUPPORT
+   WARN_ON(en & local_paca->soft_enabled & ~IRQ_DISABLE_MASK_LINUX);
+#endif
+
/* Write the new soft-enabled value */
soft_enabled_set(en);
  

Oh one other quick thing I just noticed: you could put this debug
check into your soft_enabled accessors.
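
Something like the following, perhaps (a sketch only: the mask and field
names come from the quoted hunk, and the plain store stands in for however
the series actually implements the accessor, which may well be inline asm
on the paca):

static inline void soft_enabled_set(unsigned long enable)
{
#ifdef CONFIG_IRQ_DEBUG_SUPPORT
        /* Same sanity check as in the hunk above, now on every update. */
        WARN_ON(enable & local_paca->soft_enabled & ~IRQ_DISABLE_MASK_LINUX);
#endif
        local_paca->soft_enabled = enable;
}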


OK. Will move it.


We did decide it's okay for your masking level to go both ways,
didn't we? I.e.,

local_irq_disable();
local_irq_pmu_save(flags);
local_irq_pmu_restore(flags);
local_irq_enable();

-> LINUX -> LINUX|PMU -> LINUX ->

This means PMU interrupts would not get replayed despite being
enabled here. In practice I think that doesn't matter/can't happen
because a PMU interrupt while masked would hard disable anyway. A
comment explaining it might be nice though.


Yes. I thought I had added the comment. Apologies. Will respin
with the comment included.

Maddy


Thanks,
Nick





Re: [PATCH RESEND] powerpc: fix usage of _PAGE_RO in hugepage

2016-09-19 Thread Aneesh Kumar K.V
Christophe Leroy  writes:

> On some CPUs like the 8xx, _PAGE_RW (hence _PAGE_WRITE) is defined
> as 0 and _PAGE_RO has to be set when a page is not writable.
>
> _PAGE_RO is defined by default in pte-common.h; however, BOOK3S/64
> doesn't include that file, so _PAGE_RO has to be defined explicitly
> in book3s/64/pgtable.h.
>
> fixes: a7b9f671f2d14 ("powerpc32: adds handling of _PAGE_RO")
> Signed-off-by: Christophe Leroy 
> ---
> This patch was initially part of the v1 series of patches providing
> hugepage support for the 8xx. As suggested by Aneesh, that series has
> been split to focus only on the hugepage implementation for the 8xx.
> This patch is a fix and is independent of the 8xx hugepage
> implementation, although it is required for hugepage support to work
> properly on the 8xx.
>
>  arch/powerpc/include/asm/book3s/64/pgtable.h | 2 ++
>  arch/powerpc/mm/hugetlbpage.c| 2 ++
>  2 files changed, 4 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 8ec8be9..9fd77f8 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -6,6 +6,8 @@
>   */
>  #define _PAGE_BIT_SWAP_TYPE  0
>  
> +#define _PAGE_RO 0
> +
>  #define _PAGE_EXEC   0x1 /* execute permission */
>  #define _PAGE_WRITE  0x2 /* write access allowed */
>  #define _PAGE_READ   0x4 /* read access allowed */
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 7372ee1..8a512b1 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -1021,6 +1021,8 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned 
> long addr,
>   mask = _PAGE_PRESENT | _PAGE_READ;
>   if (write)
>   mask |= _PAGE_WRITE;
> + else
> + mask |= _PAGE_RO;
>  
>   if ((pte_val(pte) & mask) != mask)
>   return 0;
> -- 

Can you also add the commit message as a code comment? It is confusing
to find that we do mask = _PAGE_READ and then mask |= _PAGE_RO; see the
sketch below.
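
For instance, something along these lines (a sketch based on the commit
message above, not Christophe's wording):

        /*
         * On some CPUs like the 8xx, _PAGE_RW (hence _PAGE_WRITE) is
         * defined as 0 and _PAGE_RO has to be set when a page is not
         * writable.  Book3s/64 defines _PAGE_RO as 0, so OR-ing it into
         * the mask is a no-op there.
         */
        mask = _PAGE_PRESENT | _PAGE_READ;
        if (write)
                mask |= _PAGE_WRITE;
        else
                mask |= _PAGE_RO;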

Otherwise

Reviewed-by: Aneesh Kumar K.V 

-aneesh