[PATCH 2/2] KVM: PPC: hypervisor large decrementer support

2016-05-31 Thread Oliver O'Halloran
Power ISAv3 extends the width of the decrementer register beyond 32 bits.
The enlarged register width is implementation dependent, but reads from
these registers are automatically sign extended to produce a 64 bit
output when operating in large mode. The HDEC always operates in large
mode while the DEC register can be operated in 32bit mode or large mode
depending on the setting of the LPCR.LD bit.

Currently the hypervisor assumes that reads from the DEC and HDEC
register produce a 32 bit result which it sign extends to 64 bits using
the extsw instruction. This behaviour can result in the guest DEC
register value being corrupted by the hypervisor when the guest is
operating in LD mode since the results of the extsw instruction only
depend on the value of bit 31 in the register to be sign extended.

This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading
from the decrementer registers. These macros will return the current
decrementer value as a 64 bit quantity regardless of the Host CPU or
guest decrementer operating mode. Additionally this patch corrects
several uses of decrementer values that assume a 32 bit register width.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Neuling <mi...@neuling.org>
---
 arch/powerpc/include/asm/exception-64s.h | 29 
 arch/powerpc/include/asm/kvm_host.h  |  2 +-
 arch/powerpc/include/asm/kvm_ppc.h   |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h  |  2 +-
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  3 +--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 38 ++--
 arch/powerpc/kvm/emulate.c   |  6 ++---
 7 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index 93ae809fe5ea..4fa303bf6d5b 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -545,4 +545,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #define FINISH_NAP
 #endif
 
+/*
+ * On ISAv3 processors the DEC register can be extended from 32 bits to 64 by
+ * setting the LD flag the LPCR. The decrementer value is a signed quantity so
+ * sign extension is required when operating in 32 bit mode. The GET_DEC() and
+ * GET_HDEC() macros handle this sign extension and yield a 64 bit result independent
+ * of the LD mode.
+ *
+ * NB: It's possible to run with LD mode disabled on ISAv3 so GET_DEC() does not
+ * use a CPU_FEATURE section. A feature section is used for GET_HDEC 
because
+ * it has no mode bit. It is always 64 bits for ISAv3 processors.
+ */
+
+#define IS_LD_ENABLED(reg) \
+   mfspr  reg,SPRN_LPCR;  \
+   andis. reg,reg,(LPCR_LD >> 16);
+
+#define GET_DEC(reg)   \
+   IS_LD_ENABLED(reg);\
+   mfspr reg, SPRN_DEC;   \
+   bne 99f;   \
+   extsw reg, reg;\
+99:
+
+#define GET_HDEC(reg) \
+   mfspr reg, SPRN_HDEC;   \
+BEGIN_FTR_SECTION   \
+   extsw reg, reg; \
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
 #endif /* _ASM_POWERPC_EXCEPTION_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index ec35af34a3fb..ddea233e2cce 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -520,7 +520,7 @@ struct kvm_vcpu_arch {
ulong mcsrr0;
ulong mcsrr1;
ulong mcsr;
-   u32 dec;
+   u64 dec;
 #ifdef CONFIG_BOOKE
u32 decar;
 #endif
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 2544edabe7f3..4de0102930e9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
-extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index c93cf35ce379..2dd92e841127 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -215,7 +215,7 @@ struct kvm_sregs {
__u32 tsr;  /* KVM_SREGS_E_UPDATE_TSR */
__u32 tcr;
__u32 decar;
-   __u32 dec;  /* KVM_SREGS_E_UPDATE_DEC */
+   __u64 dec;  /

[PATCH 1/2] powerpc/timer - large decrementer support

2016-05-31 Thread Oliver O'Halloran
POWER ISA v3 adds large decrementer (LD) mode of operation which increases
the size of the decrementer register from 32 bits to an implementation
defined width of up to 64 bits.

This patch adds support for the LD on processors with the CPU_FTR_ARCH_300
cpu feature flag set. For CPUs with this feature LD mode is enabled when the
ibm,dec-bits devicetree property is supplied for the boot CPU. The
decrementer value is a signed quantity (with negative values indicating a
pending exception) and this property is required to find the maximum
positive decrementer value. If this property is not supplied then the
traditional decrementer width of 32 bits is assumed and LD mode is disabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Michael Neuling <mi...@neuling.org>
Cc: Balbir Singh <bsinghar...@gmail.com>
Cc: Jack Miller <j...@codezen.org>
---
 arch/powerpc/include/asm/reg.h  |  1 +
 arch/powerpc/include/asm/time.h |  6 +--
 arch/powerpc/kernel/time.c  | 94 +
 3 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c1e82e968506..2793f3f03f9b 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decrementer */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 3ed9a5a21d77..fe66f1c8d8b2 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -96,7 +96,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -504,8 +505,8 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
+   if (now <= decrementer_max)
+   set_dec(now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
@@ -535,7 +536,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -583,9 +584,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 */
 
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
local_irq_disable();
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 }
 
 static void generic_suspend_enable_irqs(void)
@@ -866,7 +867,7 @@ static int decrementer_set_next_event(unsigned long evt,
 
 static int decrementer_shutdown(struct clock_event_device *dev)
 {
-   decrementer_set_next_event(DECREMENTER_MAX, dev);
+   decrementer_set_next_event(decrementer_max, dev);
return 0;
 }
 
@@

[PATCHv4] powerpc/timer - large decrementer support

2016-06-23 Thread Oliver O'Halloran
Power ISAv3 adds a large decrementer (LD) mode which increases the size
of the decrementer register. The size of the enlarged decrementer
register is between 32 and 64 bits with the exact size being dependent
on the implementation. When in LD mode, reads are sign extended to 64
bits and a decrementer exception is raised when the high bit is set (i.e.
the value goes below zero). Writes however are truncated to the physical
register width so some care needs to be taken to ensure that the high
bit is not set when reloading the decrementer. This patch adds support
for using the LD inside the host kernel on processors that support it.

When LD mode is supported firmware will supply the ibm,dec-bits property
for CPU nodes to allow the kernel to determine the maximum decrementer
value. Enabling LD mode is a hypervisor privileged operation so the
kernel can only enable it manually when running in hypervisor mode.
Guest kernels that support LD mode can request it using the
"ibm,client-architecture-support" firmware call or some other platform
specific method. If this property is not supplied then the traditional
decrementer width of 32 bits is assumed and LD mode will not be enabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Signed-off-by: Balbir Singh <bsinghar...@gmail.com>
Cc: Michael Neuling <mi...@neuling.org>
Cc: Jack Miller <j...@codezen.org>
---
 arch/powerpc/include/asm/reg.h  |   1 +
 arch/powerpc/include/asm/time.h |   6 +--
 arch/powerpc/kernel/time.c  | 102 
 3 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index a0948f40bc7b..12d970d64bb3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decrementer */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7a482a7f4d8d..aa6d399d939b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -505,8 +506,8 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
+   if (now <= decrementer_max)
+   set_dec(now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
@@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -584,9 +585,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 

[PATCH v2] powerpc/boot: Add OPAL console to epapr wrappers

2016-06-24 Thread Oliver O'Halloran
This patch adds an OPAL console backend to the powerpc boot wrapper so
that decompression failures inside the wrapper can be reported to the
user. This is important since it typically indicates data corruption in
the firmware and other nasty things.

Currently this only works when building a little endian kernel. When
compiling a 64 bit BE kernel the wrapper is always built 32 bit to be
compatible with some 32 bit firmwares. BE support will be added at a
later date. Another limitation of this is that only the "raw" type of
OPAL console is supported, however machines that provide a hvsi console
also provide a raw console so this is not an issue in practice.

Actually-written-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Stewart Smith <stew...@linux.vnet.ibm.com>
Cc: sta...@vger.kernel.org
---
 arch/powerpc/boot/Makefile |  4 +-
 arch/powerpc/boot/opal-calls.S | 49 +++
 arch/powerpc/boot/opal.c   | 88 ++
 arch/powerpc/boot/ops.h|  1 +
 arch/powerpc/boot/ppc_asm.h|  4 ++
 arch/powerpc/boot/serial.c |  2 +
 arch/powerpc/boot/types.h  | 12 ++
 7 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/boot/opal-calls.S
 create mode 100644 arch/powerpc/boot/opal.c

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 8fe78a3efc92..00cf88aa9a23 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -70,7 +70,7 @@ $(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o 
main.o): \
 libfdt   := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
 
-$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o): \
+$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
$(addprefix $(obj)/,$(libfdtheader))
 
 src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
@@ -78,7 +78,7 @@ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
-   uartlite.c mpc52xx-psc.c
+   uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
 src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c
 src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c
diff --git a/arch/powerpc/boot/opal-calls.S b/arch/powerpc/boot/opal-calls.S
new file mode 100644
index ..1f3c097e1552
--- /dev/null
+++ b/arch/powerpc/boot/opal-calls.S
@@ -0,0 +1,49 @@
+#include "ppc_asm.h"
+#include "../include/asm/opal-api.h"
+
+   .text
+
+#define OPAL_CALL(name, token) \
+   .globl name;\
+name:  \
+   li  r0, token;  \
+   b   opal_call;
+
+opal_call:
+   mflrr11
+   std r11,16(r1)
+   mfcrr12
+   stw r12,8(r1)
+   mr  r13,r2
+
+   /* Set opal return address */
+   ld  r11,opal_return@got(r2)
+   mtlrr11
+   mfmsr   r12
+
+   /* switch to BE when we enter OPAL */
+   li  r11,MSR_LE
+   andcr12,r12,r11
+   mtspr   SPRN_HSRR1,r12
+
+   /* load the opal call entry point and base */
+   ld  r11,opal@got(r2)
+   ld  r12,8(r11)
+   ld  r2,0(r11)
+   mtspr   SPRN_HSRR0,r12
+   hrfid
+
+opal_return:
+   FIXUP_ENDIAN
+   mr  r2,r13;
+   lwz r11,8(r1);
+   ld  r12,16(r1)
+   mtcrr11;
+   mtlrr12
+   blr
+
+OPAL_CALL(opal_console_write,  OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read,   OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space, 
OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_poll_events,OPAL_POLL_EVENTS);
+OPAL_CALL(opal_console_flush,  OPAL_CONSOLE_FLUSH);
diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c
new file mode 100644
index ..d0f54443caa9
--- /dev/null
+++ b/arch/powerpc/boot/opal.c
@@ -0,0 +1,88 @@
+#include "ops.h"
+#include "stdio.h"
+#include "io.h"
+#include 
+#include "../include/asm/opal-api.h"
+
+/* Global OPAL struct used by opal-call.S */
+struct opal {
+   u64 base;
+   u64 entry;
+} opal;
+
+static u32 opal_con_id;
+
+int64_t opal_console_write(int64_t term_number, u64 *length, const u8 *buffer);
+int64_t opal_console_read(int64_t term_number, uint64_t *length, u8 *buffer);
+int64_t opal_console_write_buffer_space(uint64_t term_number, uint64_t 
*length);
+int64_t opal_console_flush(uint64_t term_number);
+int6

[PATCH v5] powerpc/timer - large decrementer support

2016-06-23 Thread Oliver O'Halloran
Power ISAv3 adds a large decrementer (LD) mode which increases the size
of the decrementer register. The size of the enlarged decrementer
register is between 32 and 64 bits with the exact size being dependent
on the implementation. When in LD mode, reads are sign extended to 64
bits and a decrementer exception is raised when the high bit is set (i.e.
the value goes below zero). Writes however are truncated to the physical
register width so some care needs to be taken to ensure that the high
bit is not set when reloading the decrementer. This patch adds support
for using the LD inside the host kernel on processors that support it.

When LD mode is supported firmware will supply the ibm,dec-bits property
for CPU nodes to allow the kernel to determine the maximum decrementer
value. Enabling LD mode is a hypervisor privileged operation so the
kernel can only enable it manually when running in hypervisor mode.
Guest kernels that support LD mode can request it using the
"ibm,client-architecture-support" firmware call or some other platform
specific method. If this property is not supplied then the traditional
decrementer width of 32 bits is assumed and LD mode will not be enabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Signed-off-by: Balbir Singh <bsinghar...@gmail.com>
Cc: Michael Neuling <mi...@neuling.org>
Cc: Jack Miller <j...@codezen.org>
---
 arch/powerpc/include/asm/reg.h  |   1 +
 arch/powerpc/include/asm/time.h |   6 +--
 arch/powerpc/kernel/time.c  | 104 
 3 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index a0948f40bc7b..12d970d64bb3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decrementer */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7a482a7f4d8d..efebe52133ef 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -505,8 +506,8 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
+   if (now <= decrementer_max)
+   set_dec(now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
@@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -584,9 +585,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 

[PATCH] powerpc/boot: Add OPAL console to epapr wrappers

2016-06-24 Thread Oliver O'Halloran
This patch adds an OPAL console backend to the powerpc boot wrapper so
that decompression failures inside the wrapper can be reported to the
user. This is important since it typically indicates data corruption in
the firmware and other nasty things.

Currently this only works when building a little endian kernel. When
compiling a 64 bit BE kernel the wrapper is always built 32 bit to be
compatible with some 32 bit firmwares. BE support will be added at a
later date. Another limitation of this is that only the "raw" type of
OPAL console is supported, however machines that provide a hvsi console
also provide a raw console so this is not an issue in practice.

Actually-written-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Stewart Smith <stew...@linux.vnet.ibm.com>
Cc: sta...@vger.kernel.org
---
 arch/powerpc/boot/Makefile  |  4 ++--
 arch/powerpc/boot/ops.h |  1 +
 arch/powerpc/boot/ppc_asm.h |  4 
 arch/powerpc/boot/serial.c  |  2 ++
 arch/powerpc/boot/types.h   | 12 
 5 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 8fe78a3efc92..00cf88aa9a23 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -70,7 +70,7 @@ $(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o 
main.o): \
 libfdt   := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
 
-$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o): \
+$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
$(addprefix $(obj)/,$(libfdtheader))
 
 src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
@@ -78,7 +78,7 @@ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
-   uartlite.c mpc52xx-psc.c
+   uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
 src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c
 src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c
diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h
index 5e75e1c5518e..e19b64ef977a 100644
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@@ -89,6 +89,7 @@ int mpsc_console_init(void *devp, struct serial_console_data 
*scdp);
 int cpm_console_init(void *devp, struct serial_console_data *scdp);
 int mpc5200_psc_console_init(void *devp, struct serial_console_data *scdp);
 int uartlite_console_init(void *devp, struct serial_console_data *scdp);
+int opal_console_init(void *devp, struct serial_console_data *scdp);
 void *simple_alloc_init(char *base, unsigned long heap_size,
unsigned long granularity, unsigned long max_allocs);
 extern void flush_cache(void *, unsigned long);
diff --git a/arch/powerpc/boot/ppc_asm.h b/arch/powerpc/boot/ppc_asm.h
index 35ea60c1f070..b03373d8b386 100644
--- a/arch/powerpc/boot/ppc_asm.h
+++ b/arch/powerpc/boot/ppc_asm.h
@@ -61,6 +61,10 @@
 
 #define SPRN_TBRL  268
 #define SPRN_TBRU  269
+#define SPRN_HSRR0 0x13A   /* Hypervisor Save/Restore 0 */
+#define SPRN_HSRR1 0x13B   /* Hypervisor Save/Restore 1 */
+
+#define MSR_LE 0x0001
 
 #define FIXUP_ENDIAN  \
tdi   0, 0, 0x48; /* Reverse endian of b . + 8  */ \
diff --git a/arch/powerpc/boot/serial.c b/arch/powerpc/boot/serial.c
index 167ee9433de6..e04c1e4063ae 100644
--- a/arch/powerpc/boot/serial.c
+++ b/arch/powerpc/boot/serial.c
@@ -132,6 +132,8 @@ int serial_console_init(void)
else if (dt_is_compatible(devp, "xlnx,opb-uartlite-1.00.b") ||
 dt_is_compatible(devp, "xlnx,xps-uartlite-1.00.a"))
rc = uartlite_console_init(devp, _cd);
+   else if (dt_is_compatible(devp, "ibm,opal-console-raw"))
+   rc = opal_console_init(devp, _cd);
 
/* Add other serial console driver calls here */
 
diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h
index 31393d17a9c1..cda474cd63c8 100644
--- a/arch/powerpc/boot/types.h
+++ b/arch/powerpc/boot/types.h
@@ -12,6 +12,18 @@ typedef shorts16;
 typedef ints32;
 typedef long long  s64;
 
+
+/* required for opal-api.h */
+typedef u8  uint8_t;
+typedef u16 uint16_t;
+typedef u32 uint32_t;
+typedef u64 uint64_t;
+typedef s8  int8_t;
+typedef s16 int16_t;
+typedef s32 int32_t;
+typedef s64 int64_t;
+
+
 #define min(x,y) ({ \
typeof(x) _x = (x); \
typeof(y) _y = (y); \
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] Fix fall-through from case 30 (rld*) to case 31

2016-01-24 Thread Oliver O'Halloran
I think this bug can only be triggered if the instruction to
simulate is malformed. The switch in the else case only handles
the zero and one case, but it extracts bits 4:1 from the
instruction word so it may be other values. It's pretty minor, but
a bug is a bug.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/lib/sstep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index dc885b3..e25f73c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, 
struct pt_regs *regs,
}
}
 #endif
+   break; /* illegal instruction */
 
case 31:
switch ((instr >> 1) & 0x3ff) {
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/lib/sstep.c - Fix emulation fall-through

2016-02-15 Thread Oliver O'Halloran
There is a switch fallthrough in instr_analyze() which can cause
an invalid instruction to be emulated as a different, valid,
instruction. The rld* (opcode 30) case extracts a sub-opcode from
bits 3:1 of the instruction word. However, the only valid values
of this field are 001 and 000. These cases are correctly handled,
but the others are not which causes execution to fall through
into case 31.

Breaking out of the switch causes the instruction to be marked as
unknown and allows the caller to deal with the invalid instruction
in a manner consistent with other invalid instructions.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/lib/sstep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index dc885b3..e25f73c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, 
struct pt_regs *regs,
}
}
 #endif
+   break; /* illegal instruction */
 
case 31:
switch ((instr >> 1) & 0x3ff) {
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2] powerpc/sstep.c - Fix emulation fall-through

2016-02-15 Thread Oliver O'Halloran
There is a switch fallthrough in instr_analyze() which can cause
an invalid instruction to be emulated as a different, valid,
instruction. The rld* (opcode 30) case extracts a sub-opcode from
bits 3:1 of the instruction word. However, the only valid values
of this field are 001 and 000. These cases are correctly handled,
but the others are not which causes execution to fall through
into case 31.

Breaking out of the switch causes the instruction to be marked as
unknown and allows the caller to deal with the invalid instruction
in a manner consistent with other invalid instructions.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/lib/sstep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index dc885b3..e25f73c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, 
struct pt_regs *regs,
}
}
 #endif
+   break; /* illegal instruction */
 
case 31:
switch ((instr >> 1) & 0x3ff) {
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/2] KVM: PPC: hypervisor large decrementer support

2016-04-11 Thread Oliver O'Halloran
Power ISAv3 extends the width of the decrementer register beyond 32 bits.
The enlarged register width is implementation dependent, but reads from
these registers are automatically sign extended to produce a 64 bit output
when operating in large mode. The HDEC always operates in large mode
while the DEC register can be operated in 32bit mode or large mode
depending on the setting of the LPCR.LD bit.

Currently the hypervisor assumes that reads from the DEC and HDEC register
produce a 32 bit result which it sign extends to 64 bits using the extsw
instruction. This behaviour can result in the guest DEC register value
being corrupted by the hypervisor when the guest is operating in LD mode since
the results of the extsw instruction only depend on the value of bit
31 in the register to be sign extended.

This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading
from the decrementer registers. These macros will return the current
decrementer value as a 64 bit quantity regardless of the Host CPU or
guest decrementer operating mode. Additionally this patch corrects several
uses of decrementer values that assume a 32 bit register width.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Paul Mackerras <pau...@samba.org>
---
 arch/powerpc/include/asm/exception-64s.h | 22 ++
 arch/powerpc/include/asm/kvm_host.h  |  2 +-
 arch/powerpc/include/asm/kvm_ppc.h   |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h  |  2 +-
 arch/powerpc/kernel/exceptions-64s.S |  9 +++-
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  3 +--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 38 ++--
 arch/powerpc/kvm/emulate.c   |  4 ++--
 8 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index 93ae809fe5ea..d922f76c682d 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -545,4 +545,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #define FINISH_NAP
 #endif
 
+/* these ensure that we always get a 64bit value from the
+ * decrementer register. */
+
+#define IS_LD_ENABLED(reg) \
+   mfspr  reg,SPRN_LPCR;  \
+   andis. reg,reg,(LPCR_LD >> 16);
+
+#define GET_DEC(reg)   \
+   IS_LD_ENABLED(reg);\
+   mfspr reg, SPRN_DEC;   \
+   bne 99f;   \
+   extsw reg, reg;\
+99:
+
+/* For CPUs that support it the Hypervisor LD is
+ * always enabled, so this needs to be feature gated */
+#define GET_HDEC(reg) \
+   mfspr reg, SPRN_HDEC;   \
+BEGIN_FTR_SECTION   \
+   extsw reg, reg; \
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
 #endif /* _ASM_POWERPC_EXCEPTION_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d7b343170453..6330d3fca083 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -516,7 +516,7 @@ struct kvm_vcpu_arch {
ulong mcsrr0;
ulong mcsrr1;
ulong mcsr;
-   u32 dec;
+   u64 dec;
 #ifdef CONFIG_BOOKE
u32 decar;
 #endif
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 2544edabe7f3..4de0102930e9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
-extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index c93cf35ce379..2dd92e841127 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -215,7 +215,7 @@ struct kvm_sregs {
__u32 tsr;  /* KVM_SREGS_E_UPDATE_TSR */
__u32 tcr;
__u32 decar;
-   __u32 dec;  /* KVM_SREGS_E_UPDATE_DEC */
+   __u64 dec;  /* KVM_SREGS_E_UPDATE_DEC */
 
/*
 * Userspace can read TB directly, but the
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 7716cebf4b8e..984ae894e758 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -641,7 +641,14 @@ masked_##_H##interrupt:

[PATCH 1/2] powerpc/timer - large decrementer support

2016-04-11 Thread Oliver O'Halloran
POWER ISA v3 adds large decrementer (LD) mode of operation which increases
the size of the decrementer register from 32 bits to an implementation
defined width of up to 64 bits.

This patch adds support for the LD on processors with the CPU_FTR_ARCH_300
cpu feature flag set. Even for CPUs with this feature LD mode is only
enabled when the ibm,dec-bits devicetree property is supplied
for the boot CPU. The decrementer value is a signed quantity (with
negative values indicating a pending exception) and this property is
required to find the maximum positive decrementer value. If this property
is not supplied then the traditional decrementer width of 32 bits is
assumed and LD mode is disabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Jack Miller <j...@codezen.org>
---
 arch/powerpc/include/asm/reg.h  |  1 +
 arch/powerpc/include/asm/time.h |  6 +--
 arch/powerpc/kernel/time.c  | 89 +
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f5f4c66bbbc9..ff581ed1ab9d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decremeter */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 81b0900a39ee..0afaef6b5b6a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -503,7 +504,7 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
+   if (now <= decrementer_max)
set_dec((int)now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
@@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -562,6 +563,7 @@ void timer_interrupt(struct pt_regs * regs)
irq_enter();
 
__timer_interrupt();
+
irq_exit();
set_irq_regs(old_regs);
 }
@@ -582,9 +584,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 */
 
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
local_irq_disable();
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 }
 
 static void generic_suspend_enable_irqs(void)
@@ -865,7 +867,7 @@ static int decrementer_set_next_event(unsigned long evt,
 
 static int decrementer_shutdown(struct clock_event_device *dev)
 {
-   decrementer_set_next_event(DECREMENTER_MAX, dev);
+   decrementer_set_next_event(decrementer_max, dev);
return 0;
 }
 

[PATCH] powerpc/process: fix altivec SPR not being saved

2016-03-06 Thread Oliver O'Halloran
In save_sprs() in process.c contains the following test:

if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
t->vrsave = mfspr(SPRN_VRSAVE);

CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test
is equivalent to:

if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
cpu_has_feature(CPU_FTR_COHERENT_ICACHE))

On CPUs without support for both (i.e G5) this results in vrsave not being
saved between context switches. The vector register save/restore code
doesn't use VRSAVE to determine which registers to save/restore,
but the value of VRSAVE is used to determine if altivec is being used
in several code paths.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 8224852..5a4d4d1 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -855,7 +855,7 @@ void restore_tm_state(struct pt_regs *regs)
 static inline void save_sprs(struct thread_struct *t)
 {
 #ifdef CONFIG_ALTIVEC
-   if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
t->vrsave = mfspr(SPRN_VRSAVE);
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/process: fix altivec SPR not being saved

2016-03-07 Thread Oliver O'Halloran
In save_sprs() in process.c contains the following test:

if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
t->vrsave = mfspr(SPRN_VRSAVE);

CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test
is equivalent to:

if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
cpu_has_feature(CPU_FTR_COHERENT_ICACHE))

On CPUs without support for both (i.e G5) this results in vrsave not being
saved between context switches. The vector register save/restore code
doesn't use VRSAVE to determine which registers to save/restore,
but the value of VRSAVE is used to determine if altivec is being used
in several code paths.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Signed-off-by: Anton Blanchard <an...@samba.org>
Fixes: 152d523e6307 ("powerpc: Create context switch helpers save_sprs() and 
restore_sprs()")
Cc: sta...@vger.kernel.org
---
 arch/powerpc/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dccc87e8fee5..bc6aa87a3b12 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -854,7 +854,7 @@ void restore_tm_state(struct pt_regs *regs)
 static inline void save_sprs(struct thread_struct *t)
 {
 #ifdef CONFIG_ALTIVEC
-   if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
t->vrsave = mfspr(SPRN_VRSAVE);
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2] powerpc/mm: Ensure "special" zones are empty

2016-05-11 Thread Oliver O'Halloran
The mm zone mechanism was traditionally used by arch specific code to
partition memory into allocation zones. However there are several zones
that are managed by the mm subsystem rather than the architecture. Most
architectures set the max PFN of these special zones to zero, however on
powerpc we set them to ~0ul. This, in conjunction with a bug in
free_area_init_nodes() results in all of system memory being placed in
ZONE_DEVICE when enabled. Device memory cannot be used for regular kernel
memory allocations so this will cause a kernel panic at boot. Given the
planned addition of more mm managed zones (ZONE_CMA) we should aim to be
consistent with every other architecture and set the max PFN for these
zones to zero.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Reviewed-by: Balbir Singh <bsinghar...@gmail.com>
Cc: linux...@kvack.org
---
 arch/powerpc/mm/mem.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 879e0bc6f82e..f35e6605c422 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -239,8 +239,14 @@ static int __init mark_nonram_nosave(void)
 
 static bool zone_limits_final;
 
+/*
+ * The memory zones past TOP_ZONE are managed by generic mm code.
+ * These should be set to zero since that's what every other
+ * architecture does.
+ */
 static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
-   [0 ... MAX_NR_ZONES - 1] = ~0UL
+   [0... TOP_ZONE] = ~0UL,
+   [TOP_ZONE + 1 ... MAX_NR_ZONES - 1] = 0
 };
 
 /*
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2 2/2] powerpc/mm: Ensure "special" zones are empty

2016-05-05 Thread Oliver O'Halloran
The mm zone mechanism was traditionally used by arch specific code to
partition memory into allocation zones. However there are several zones
that are managed by the mm subsystem rather than the architecture. Most
architectures set the max PFN of these special zones to zero, however on
powerpc we set them to ~0ul. This, in conjunction with a bug in
free_area_init_nodes() results in all of system memory being placed in
ZONE_DEVICE when enabled. Device memory cannot be used for regular kernel
memory allocations so this will cause a kernel panic at boot.

Given the planned addition of more mm managed zones (ZONE_CMA) we should
aim to be consistent with every other architecture and set the max PFN for
these zones to zero.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: linux...@kvack.org
---
 arch/powerpc/mm/mem.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8f4c19789a38..f0a058ebb6d7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -239,8 +239,14 @@ static int __init mark_nonram_nosave(void)
 
 static bool zone_limits_final;
 
+/*
+ * The memory zones past TOP_ZONE are managed by generic mm code.
+ * These should be set to zero since that's what every other
+ * architecture does.
+ */
 static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
-   [0 ... MAX_NR_ZONES - 1] = ~0UL
+   [0... TOP_ZONE - 1] = ~0UL,
+   [TOP_ZONE ... MAX_NR_ZONES - 1] = 0
 };
 
 /*
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2 1/2] powerpc/mm: define TOP_ZONE as a constant

2016-05-05 Thread Oliver O'Halloran
The zone that contains the top of memory will be either ZONE_NORMAL
or ZONE_HIGHMEM depending on the kernel config. There are two functions
that require this information and both of them use an #ifdef to set
a local variable (top_zone). This is a little silly so lets just make it
a constant.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: linux...@kvack.org
---
 arch/powerpc/mm/mem.c | 17 +
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ac79dbde1015..8f4c19789a38 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -68,12 +68,15 @@ pte_t *kmap_pte;
 EXPORT_SYMBOL(kmap_pte);
 pgprot_t kmap_prot;
 EXPORT_SYMBOL(kmap_prot);
+#define TOP_ZONE ZONE_HIGHMEM
 
 static inline pte_t *virt_to_kpte(unsigned long vaddr)
 {
return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
vaddr), vaddr), vaddr);
 }
+#else
+#define TOP_ZONE ZONE_NORMAL
 #endif
 
 int page_is_ram(unsigned long pfn)
@@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned 
long pfn_limit)
  */
 int dma_pfn_limit_to_zone(u64 pfn_limit)
 {
-   enum zone_type top_zone = ZONE_NORMAL;
int i;
 
-#ifdef CONFIG_HIGHMEM
-   top_zone = ZONE_HIGHMEM;
-#endif
-
-   for (i = top_zone; i >= 0; i--) {
+   for (i = TOP_ZONE; i >= 0; i--) {
if (max_zone_pfns[i] <= pfn_limit)
return i;
}
@@ -289,7 +287,6 @@ void __init paging_init(void)
 {
unsigned long long total_ram = memblock_phys_mem_size();
phys_addr_t top_of_ram = memblock_end_of_DRAM();
-   enum zone_type top_zone;
 
 #ifdef CONFIG_PPC32
unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
@@ -313,13 +310,9 @@ void __init paging_init(void)
   (long int)((top_of_ram - total_ram) >> 20));
 
 #ifdef CONFIG_HIGHMEM
-   top_zone = ZONE_HIGHMEM;
limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
-#else
-   top_zone = ZONE_NORMAL;
 #endif
-
-   limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
+   limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT);
zone_limits_final = true;
free_area_init_nodes(max_zone_pfns);
 
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[RFC PATCH] mm/init: fix zone boundary creation

2016-05-05 Thread Oliver O'Halloran
As a part of memory initialisation the architecture passes an array to
free_area_init_nodes() which specifies the max PFN of each memory zone.
This array is not necessarily monotonic (due to unused zones) so this
array is parsed to build monotonic lists of the min and max PFN for
each zone. ZONE_MOVABLE is special cased here as its limits are managed by
the mm subsystem rather than the architecture. Unfortunately, this special
casing is broken when ZONE_MOVABLE is the not the last zone in the zone
list. The core of the issue is:

if (i == ZONE_MOVABLE)
continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];

As ZONE_MOVABLE is skipped the lowest_possible_pfn of the next zone
will be set to zero. This patch fixes this bug by adding explicitly
tracking where the next zone should start rather than relying on the
contents arch_zone_highest_possible_pfn[].

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
---
 mm/page_alloc.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59de90d5d3a3..fc78306ce087 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5980,15 +5980,18 @@ void __init free_area_init_nodes(unsigned long 
*max_zone_pfn)
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
-   arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
-   arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
-   for (i = 1; i < MAX_NR_ZONES; i++) {
+
+   start_pfn = find_min_pfn_with_active_regions();
+
+   for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
-   arch_zone_lowest_possible_pfn[i] =
-   arch_zone_highest_possible_pfn[i-1];
-   arch_zone_highest_possible_pfn[i] =
-   max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+
+   end_pfn = max(max_zone_pfn[i], start_pfn);
+   arch_zone_lowest_possible_pfn[i] = start_pfn;
+   arch_zone_highest_possible_pfn[i] = end_pfn;
+
+   start_pfn = end_pfn;
}
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/2] powerpc/mm: Ensure "special" zones are empty

2016-05-05 Thread Oliver O'Halloran
The mm zone mechanism was traditionally used by arch specific code to
partition memory into allocation zones. However there are several zones
that are managed by the mm subsystem rather than the architecture. Most
architectures set the max PFN of these special zones to zero, however on
powerpc we set them to ~0ul. This, in conjunction with a bug in
free_area_init_nodes() results in all of system memory being placed
in ZONE_DEVICE when enabled. Device memory cannot be used
for regular kernel memory allocations so this will cause a kernel panic at
boot.

Given the planned addition of more mm managed zones (ZONE_CMA) we should
aim to be consistent with every other architecture and set the max PFN
for these zones to zero.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/mem.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8f4c19789a38..f0a058ebb6d7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -239,8 +239,14 @@ static int __init mark_nonram_nosave(void)
 
 static bool zone_limits_final;
 
+/*
+ * The memory zones past TOP_ZONE are managed by the generic
+ * mm subsystem which expects the max PFN for these zones
+ * to be set to zero.
+ */
 static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
-   [0 ... MAX_NR_ZONES - 1] = ~0UL
+   [0... TOP_ZONE - 1] = ~0UL,
+   [TOP_ZONE ... MAX_NR_ZONES - 1] = 0
 };
 
 /*
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/2] powerpc/mm: define TOP_ZONE as a constant

2016-05-05 Thread Oliver O'Halloran
The zone that contains the top of memory will be either ZONE_NORMAL
or ZONE_HIGHMEM depending on the kernel config. There are two functions
that require this information and both of them use an #ifdef to set
a local variable (top_zone). This is a little silly so lets just make it
a constant.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/mem.c | 17 +
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ac79dbde1015..8f4c19789a38 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -68,12 +68,15 @@ pte_t *kmap_pte;
 EXPORT_SYMBOL(kmap_pte);
 pgprot_t kmap_prot;
 EXPORT_SYMBOL(kmap_prot);
+#define TOP_ZONE ZONE_HIGHMEM
 
 static inline pte_t *virt_to_kpte(unsigned long vaddr)
 {
return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
vaddr), vaddr), vaddr);
 }
+#else
+#define TOP_ZONE ZONE_NORMAL
 #endif
 
 int page_is_ram(unsigned long pfn)
@@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned 
long pfn_limit)
  */
 int dma_pfn_limit_to_zone(u64 pfn_limit)
 {
-   enum zone_type top_zone = ZONE_NORMAL;
int i;
 
-#ifdef CONFIG_HIGHMEM
-   top_zone = ZONE_HIGHMEM;
-#endif
-
-   for (i = top_zone; i >= 0; i--) {
+   for (i = TOP_ZONE; i >= 0; i--) {
if (max_zone_pfns[i] <= pfn_limit)
return i;
}
@@ -289,7 +287,6 @@ void __init paging_init(void)
 {
unsigned long long total_ram = memblock_phys_mem_size();
phys_addr_t top_of_ram = memblock_end_of_DRAM();
-   enum zone_type top_zone;
 
 #ifdef CONFIG_PPC32
unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
@@ -313,13 +310,9 @@ void __init paging_init(void)
   (long int)((top_of_ram - total_ram) >> 20));
 
 #ifdef CONFIG_HIGHMEM
-   top_zone = ZONE_HIGHMEM;
limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
-#else
-   top_zone = ZONE_NORMAL;
 #endif
-
-   limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
+   limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT);
zone_limits_final = true;
free_area_init_nodes(max_zone_pfns);
 
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3 1/2] powerpc/timer - large decrementer support

2016-05-09 Thread Oliver O'Halloran
POWER ISA v3 adds large decrementer (LD) mode of operation which increases
the size of the decrementer register from 32 bits to an implementation
defined width of up to 64 bits.

This patch adds support for the LD on processors with the CPU_FTR_ARCH_300
cpu feature flag set. For CPUs with this feature LD mode is enabled
when the ibm,dec-bits devicetree property is supplied for the boot CPU. The
decrementer value is a signed quantity (with negative values indicating a
pending exception) and this property is required to find the maximum
positive decrementer value. If this property is not supplied then the
traditional decrementer width of 32 bits is assumed and LD mode is disabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Michael Neuling <mi...@neuling.org>
Cc: Balbir Singh <bsinghar...@gmail.com>
Cc: Jack Miller <j...@codezen.org>
---
 arch/powerpc/include/asm/reg.h  |  1 +
 arch/powerpc/include/asm/time.h |  6 +--
 arch/powerpc/kernel/time.c  | 92 +
 3 files changed, 89 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f5f4c66bbbc9..ff581ed1ab9d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decremeter */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 81b0900a39ee..0656e80cadbf 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -503,7 +504,7 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
+   if (now <= decrementer_max)
set_dec((int)now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
@@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -582,9 +583,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 */
 
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
local_irq_disable();
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 }
 
 static void generic_suspend_enable_irqs(void)
@@ -865,7 +866,7 @@ static int decrementer_set_next_event(unsigned long evt,
 
 static int decrementer_shutdown(struct clock_event_device *dev)
 {
-   decrementer_set_next_event(DECREMENTER_MAX, dev);
+   decrementer_set_next_event(decrementer_max, dev);
return 0;
 }
 
@@ -891,6 +892,76 @@ static void register_decrementer_clockevent(int cpu)
clockevents_regi

[PATCH v3 2/2] KVM: PPC: hypervisor large decrementer support

2016-05-09 Thread Oliver O'Halloran
Power ISAv3 extends the width of the decrementer register from 32 bits.
The enlarged register width is implementation dependent, but reads from
these registers are automatically sign extended to produce a 64 bit
output when operating in large mode. The HDEC always operates in large
mode while the DEC register can be operated in 32bit mode or large mode
depending on the setting of the LPCR.LD bit.

Currently the hypervisor assumes that reads from the DEC and HDEC
register produce a 32 bit result which it sign extends to 64 bits using
the extsw instruction. This behaviour can result in the guest DEC
register value being corrupted by the hypervisor when the guest is
operating in LD mode since the results of the extsw instruction only
depends on the value of bit 31 in the register to be sign extended.

This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading
from the decrementer registers. These macros will return the current
decrementer value as a 64 bit quantity regardless of the Host CPU or
guest decrementer operating mode. Additionally this patch corrects
several uses of decrementer values that assume a 32 bit register width.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Neuling <mi...@neuling.org>
---
 arch/powerpc/include/asm/exception-64s.h | 29 
 arch/powerpc/include/asm/kvm_host.h  |  2 +-
 arch/powerpc/include/asm/kvm_ppc.h   |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h  |  2 +-
 arch/powerpc/kernel/time.c   |  2 +-
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  3 +--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 38 ++--
 arch/powerpc/kvm/emulate.c   |  6 ++---
 8 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index 93ae809fe5ea..4fa303bf6d5b 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -545,4 +545,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #define FINISH_NAP
 #endif
 
+/*
+ * On ISAv3 processors the DEC register can be extended from 32 bits to 64 by
+ * setting the LD flag the LPCR. The decrementer value is a signed quantity so
+ * sign exension is required when operating in 32 bit mode. The GET_DEC() and
+ * GET_HDEC() handle this sign extension and yield a 64 bit result independent
+ * of the LD mode.
+ *
+ * NB: It's possible run with LD mode disabled on ISAv3 so GET_DEC() does not
+ * use a CPU_FEATURE section. A feature section is used for GET_HDEC 
because
+ * it has no mode bit. It is always 64 bits for ISAv3 processors.
+ */
+
+#define IS_LD_ENABLED(reg) \
+   mfspr  reg,SPRN_LPCR;  \
+   andis. reg,reg,(LPCR_LD >> 16);
+
+#define GET_DEC(reg)   \
+   IS_LD_ENABLED(reg);\
+   mfspr reg, SPRN_DEC;   \
+   bne 99f;   \
+   extsw reg, reg;\
+99:
+
+#define GET_HDEC(reg) \
+   mfspr reg, SPRN_HDEC;   \
+BEGIN_FTR_SECTION   \
+   extsw reg, reg; \
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
 #endif /* _ASM_POWERPC_EXCEPTION_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d7b343170453..6330d3fca083 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -516,7 +516,7 @@ struct kvm_vcpu_arch {
ulong mcsrr0;
ulong mcsrr1;
ulong mcsr;
-   u32 dec;
+   u64 dec;
 #ifdef CONFIG_BOOKE
u32 decar;
 #endif
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 2544edabe7f3..4de0102930e9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
-extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index c93cf35ce379..2dd92e841127 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -215,7 +215,7 @@ struct kvm_sregs {
__u32 tsr;  /* KVM_SREGS_E_UPDATE_TSR */
__u32 tcr;
__u32 decar;
-   __u32 dec;  /* KVM_SREGS_E_UPDATE_DEC */
+

[PATCH v2 1/2] powerpc/timer - large decrementer support

2016-05-04 Thread Oliver O'Halloran
POWER ISA v3 adds large decrementer (LD) mode of operation which increases
the size of the decrementer register from 32 bits to an implementation
defined width of up to 64 bits.

This patch adds support for the LD on processors with the CPU_FTR_ARCH_300
cpu feature flag set. Even for CPUs with this feature LD mode is only
enabled when the ibm,dec-bits devicetree property is supplied
for the boot CPU. The decrementer value is a signed quantity (with
negative values indicating a pending exception) and this property is
required to find the maximum positive decrementer value. If this property
is not supplied then the traditional decrementer width of 32 bits is
assumed and LD mode is disabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Jack Miller <j...@codezen.org>
Cc: Balbir Singh <bsinghar...@gmail.com>
---
 arch/powerpc/include/asm/reg.h  |  1 +
 arch/powerpc/include/asm/time.h |  6 +--
 arch/powerpc/kernel/time.c  | 89 +
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f5f4c66bbbc9..ff581ed1ab9d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decrementer */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 81b0900a39ee..fab34abfb4cd 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -503,7 +504,7 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
+   if (now <= decrementer_max)
set_dec((int)now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
@@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -582,9 +583,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 */
 
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
local_irq_disable();
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 }
 
 static void generic_suspend_enable_irqs(void)
@@ -865,7 +866,7 @@ static int decrementer_set_next_event(unsigned long evt,
 
 static int decrementer_shutdown(struct clock_event_device *dev)
 {
-   decrementer_set_next_event(DECREMENTER_MAX, dev);
+   decrementer_set_next_event(decrementer_max, dev);
return 0;
 }
 
@@ -891,6 +892,73 @@ static void register_decrementer_clockevent(int cpu)
clockevents_register_device(dec);
 }
 
+static in

[PATCH v2 2/2] KVM: PPC: hypervisor large decrementer support

2016-05-04 Thread Oliver O'Halloran
Power ISAv3 extends the width of the decrementer register from 32 bits.
The enlarged register width is implementation dependent, but reads from
these registers are automatically sign extended to produce a 64 bit output
when operating in large mode. The HDEC always operates in large mode
while the DEC register can be operated in 32bit mode or large mode
depending on the setting of the LPCR.LD bit.

Currently the hypervisor assumes that reads from the DEC and HDEC register
produce a 32 bit result which it sign extends to 64 bits using the extsw
instruction. This behaviour can result in the guest DEC register value
being corrupted by the hypervisor when the guest is operating in LD mode
since the results of the extsw instruction only depends on the value of
bit 31 in the register to be sign extended.

This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading
from the decrementer registers. These macros will return the current
decrementer value as a 64 bit quantity regardless of the Host CPU or
guest decrementer operating mode. Additionally this patch corrects several
uses of decrementer values that assume a 32 bit register width.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Balbir Singh <bsinghar...@gmail.com>
---
 arch/powerpc/include/asm/exception-64s.h | 29 
 arch/powerpc/include/asm/kvm_host.h  |  2 +-
 arch/powerpc/include/asm/kvm_ppc.h   |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h  |  2 +-
 arch/powerpc/kernel/time.c   |  2 +-
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  3 +--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 38 ++--
 arch/powerpc/kvm/emulate.c   |  6 ++---
 8 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index 93ae809fe5ea..4fa303bf6d5b 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -545,4 +545,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #define FINISH_NAP
 #endif
 
+/*
+ * On ISAv3 processors the DEC register can be extended from 32 bits to 64 by
+ * setting the LD flag in the LPCR. The decrementer value is a signed quantity so
+ * sign extension is required when operating in 32 bit mode. The GET_DEC() and
+ * GET_HDEC() handle this sign extension and yield a 64 bit result independent
+ * of the LD mode.
+ *
+ * NB: It's possible to run with LD mode disabled on ISAv3 so GET_DEC() does not
+ * use a CPU_FEATURE section. A feature section is used for GET_HDEC 
because
+ * it has no mode bit. It is always 64 bits for ISAv3 processors.
+ */
+
+#define IS_LD_ENABLED(reg) \
+   mfspr  reg,SPRN_LPCR;  \
+   andis. reg,reg,(LPCR_LD >> 16);
+
+#define GET_DEC(reg)   \
+   IS_LD_ENABLED(reg);\
+   mfspr reg, SPRN_DEC;   \
+   bne 99f;   \
+   extsw reg, reg;\
+99:
+
+#define GET_HDEC(reg) \
+   mfspr reg, SPRN_HDEC;   \
+BEGIN_FTR_SECTION   \
+   extsw reg, reg; \
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
 #endif /* _ASM_POWERPC_EXCEPTION_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d7b343170453..6330d3fca083 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -516,7 +516,7 @@ struct kvm_vcpu_arch {
ulong mcsrr0;
ulong mcsrr1;
ulong mcsr;
-   u32 dec;
+   u64 dec;
 #ifdef CONFIG_BOOKE
u32 decar;
 #endif
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 2544edabe7f3..4de0102930e9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
-extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index c93cf35ce379..2dd92e841127 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -215,7 +215,7 @@ struct kvm_sregs {
__u32 tsr;  /* KVM_SREGS_E_UPDATE_TSR */
__u32 tcr;
__u32 decar;
-   __u32 dec;  /* KVM_SREGS_E_UPDATE_DEC */
+

[PATCH v2] powerpc/mm: Add a parameter to disable 1TB segs

2016-07-04 Thread Oliver O'Halloran
This patch adds the kernel command line parameter "no_tb_segs" which
forces the kernel to use 256MB rather than 1TB segments. Forcing the use
of 256MB segments makes it considerably easier to test code that depends
on an SLB miss occurring.

Suggested-by: Michael Neuling <mi...@neuling.org>
Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
Changes in from v1:
Renamed parameter from "no_tb_segs" to "disable_1tb_segments"
Added kernel-parameters.txt entry

 Documentation/kernel-parameters.txt |  6 ++
 arch/powerpc/mm/hash_utils_64.c | 15 +++
 2 files changed, 21 insertions(+)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 4640ea2dce9b..3be08fda82dd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -920,6 +920,12 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
dhash_entries=  [KNL]
Set number of hash buckets for dentry cache.
 
+   disable_1tb_segments [PPC]
+   Disables the use of 1TB hash page table segments. This
+   causes the kernel to fall back to 256MB segments which
+   can be useful when debugging issues that require an SLB
+   miss to occur.
+
disable=[IPV6]
See Documentation/networking/ipv6.txt.
 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 5b22ba0b58bc..7e6d38e01645 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -321,6 +321,15 @@ int htab_remove_mapping(unsigned long vstart, unsigned 
long vend,
return ret;
 }
 
+static bool disable_1tb_segments = false;
+
+static int __init parse_disable_1tb_segments(char *p)
+{
+   disable_1tb_segments = true;
+   return 0;
+}
+early_param("disable_1tb_segments", parse_disable_1tb_segments);
+
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
 const char *uname, int depth,
 void *data)
@@ -339,6 +348,12 @@ static int __init htab_dt_scan_seg_sizes(unsigned long 
node,
for (; size >= 4; size -= 4, ++prop) {
if (be32_to_cpu(prop[0]) == 40) {
DBG("1T segment support detected\n");
+
+   if (disable_1tb_segments) {
+   DBG("1T segments disabled by command line\n");
+   break;
+   }
+
cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
return 1;
}
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3] powerpc/boot: Add OPAL console to epapr wrappers

2016-06-30 Thread Oliver O'Halloran
This patch adds an OPAL console backend to the powerpc boot wrapper so
that decompression failures inside the wrapper can be reported to the
user. This is important since it typically indicates data corruption in
the firmware and other nasty things.

Currently this only works when building a little endian kernel. When
compiling a 64 bit BE kernel the wrapper is always built 32 bit to be
compatible with some 32 bit firmwares. BE support will be added at a
later date. Another limitation of this is that only the "raw" type of
OPAL console is supported, however machines that provide a hvsi console
also provide a raw console so this is not an issue in practice.

Actually-written-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Cc: Stewart Smith <stew...@linux.vnet.ibm.com>
Cc: sta...@vger.kernel.org
---

Changelog:

v2: Added missing files
v3: Added copyright headers to opal.c and opal-calls.S

---
 arch/powerpc/boot/Makefile |  4 +-
 arch/powerpc/boot/opal-calls.S | 58 +
 arch/powerpc/boot/opal.c   | 97 ++
 arch/powerpc/boot/ops.h|  1 +
 arch/powerpc/boot/ppc_asm.h|  4 ++
 arch/powerpc/boot/serial.c |  2 +
 arch/powerpc/boot/types.h  | 10 +
 7 files changed, 174 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/boot/opal-calls.S
 create mode 100644 arch/powerpc/boot/opal.c

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 8fe78a3efc92..00cf88aa9a23 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -70,7 +70,7 @@ $(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o 
main.o): \
 libfdt   := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
 
-$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o): \
+$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
$(addprefix $(obj)/,$(libfdtheader))
 
 src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
@@ -78,7 +78,7 @@ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
-   uartlite.c mpc52xx-psc.c
+   uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
 src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c
 src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c
diff --git a/arch/powerpc/boot/opal-calls.S b/arch/powerpc/boot/opal-calls.S
new file mode 100644
index ..ff2f1b97bc53
--- /dev/null
+++ b/arch/powerpc/boot/opal-calls.S
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "ppc_asm.h"
+#include "../include/asm/opal-api.h"
+
+   .text
+
+#define OPAL_CALL(name, token) \
+   .globl name;\
+name:  \
+   li  r0, token;  \
+   b   opal_call;
+
+opal_call:
+   mflrr11
+   std r11,16(r1)
+   mfcrr12
+   stw r12,8(r1)
+   mr  r13,r2
+
+   /* Set opal return address */
+   ld  r11,opal_return@got(r2)
+   mtlrr11
+   mfmsr   r12
+
+   /* switch to BE when we enter OPAL */
+   li  r11,MSR_LE
+   andcr12,r12,r11
+   mtspr   SPRN_HSRR1,r12
+
+   /* load the opal call entry point and base */
+   ld  r11,opal@got(r2)
+   ld  r12,8(r11)
+   ld  r2,0(r11)
+   mtspr   SPRN_HSRR0,r12
+   hrfid
+
+opal_return:
+   FIXUP_ENDIAN
+   mr  r2,r13;
+   lwz r11,8(r1);
+   ld  r12,16(r1)
+   mtcrr11;
+   mtlrr12
+   blr
+
+OPAL_CALL(opal_console_write,  OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read,   OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space, 
OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_poll_events,OPAL_POLL_EVENTS);
+OPAL_CALL(opal_console_flush,  OPAL_CONSOLE_FLUSH);
diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c
new file mode 100644
index ..3a2ce1e1f048
--- /dev/null
+++ b/arch/powerpc/boot/opal.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License

[PATCH v6] powerpc/timer - large decrementer support

2016-07-01 Thread Oliver O'Halloran
Power ISAv3 adds a large decrementer (LD) mode which increases the size
of the decrementer register. The size of the enlarged decrementer
register is between 32 and 64 bits with the exact size being dependent
on the implementation. When in LD mode, reads are sign extended to 64
bits and a decrementer exception is raised when the high bit is set (i.e
the value goes below zero). Writes however are truncated to the physical
register width so some care needs to be taken to ensure that the high
bit is not set when reloading the decrementer. This patch adds support
for using the LD inside the host kernel on processors that support it.

When LD mode is supported firmware will supply the ibm,dec-bits property
for CPU nodes to allow the kernel to determine the maximum decrementer
value. Enabling LD mode is a hypervisor privileged operation so the kernel
can only enable it manually when running in hypervisor mode. Guests that
support LD mode can request it using the "ibm,client-architecture-support"
firmware call (not implemented in this patch) or some other platform
specific method. If this property is not supplied then the traditional
decrementer width of 32 bits is assumed and LD mode will not be enabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
Signed-off-by: Balbir Singh <bsinghar...@gmail.com>
Acked-by: Michael Neuling <mi...@neuling.org>
Cc: Jack Miller <j...@codezen.org>
---

Changes from v5:
Removed readback test after enabling LD mode since mikey thought it
was dumb.

Replaced use of of_get_property() and of_read_number() with
of_property_read_u32()

---
 arch/powerpc/include/asm/reg.h  |  1 +
 arch/powerpc/include/asm/time.h |  6 ++--
 arch/powerpc/kernel/time.c  | 67 -
 3 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index a0948f40bc7b..12d970d64bb3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decrementer */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7a482a7f4d8d..d1cb44ddfc95 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -505,8 +506,8 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
+   if (now <= decrementer_max)
+   set_dec(now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
@@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
  

[PATCH] powerpc/mm: Add a parameter to disable 1TB segs

2016-07-03 Thread Oliver O'Halloran
This patch adds the kernel command line parameter "no_tb_segs" which
forces the kernel to use 256MB rather than 1TB segments. Forcing the use
of 256MB segments makes it considerably easier to test code that depends
on an SLB miss occurring.

Suggested-by: Michael Neuling <mi...@neuling.org>
Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/hash_utils_64.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 5b22ba0b58bc..6da1a9d18e15 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -321,6 +321,15 @@ int htab_remove_mapping(unsigned long vstart, unsigned 
long vend,
return ret;
 }
 
+static bool no_tb_segs = false;
+
+static int __init parse_no_tb_segs(char *p)
+{
+   no_tb_segs = true;
+   return 0;
+}
+early_param("no_tb_segs", parse_no_tb_segs);
+
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
 const char *uname, int depth,
 void *data)
@@ -339,6 +348,12 @@ static int __init htab_dt_scan_seg_sizes(unsigned long 
node,
for (; size >= 4; size -= 4, ++prop) {
if (be32_to_cpu(prop[0]) == 40) {
DBG("1T segment support detected\n");
+
+   if (no_tb_segs) {
+   DBG("Forcing 256MB segments\n");
+   break;
+   }
+
cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
return 1;
}
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PowerPC] 4.10.0 fails to build on BE config

2017-02-21 Thread Oliver O'Halloran
On Tue, Feb 21, 2017 at 6:25 PM, abdul  wrote:
> Hi,
>
> Today's mainline build, breaks on Power6 and Power7 (all BE config) with
> these build errors
>
> arch/powerpc/kernel/time.c: In function ‘running_clock’:
> arch/powerpc/kernel/time.c:712:2: error: implicit declaration of function
> ‘cputime_to_nsecs’ [-Werror=implicit-function-declaration]
> return local_clock() -
> cputime_to_nsecs(kcpustat_this_cpu->cpustat[CPUTIME_STEAL]);
> ^
> cc1: some warnings being treated as errors
> make[1]: *** [arch/powerpc/kernel/time.o] Error 1
>
>
> Regard's
> Abdul Haleem
> IBM Linux Technology Center.

Hi Abdul,

Are there any extra patches in your tree? I briefly tried to reproduce
this, but in my local tree this line:

> return local_clock() - 
> cputime_to_nsecs(kcpustat_this_cpu->cpustat[CPUTIME_STEAL]);

Is at time.c:692 rather than time.c:712

Oliver


Re: [PATCH] powerpc/powernv: add hdat attribute to sysfs

2017-02-22 Thread Oliver O'Halloran
On Thu, Feb 23, 2017 at 1:29 PM, Matt Brown <matthew.brown@gmail.com> wrote:
> From: Matt Brown <brownmatt1...@gmail.com>
>
> The HDAT data area is consumed by skiboot and turned into a device-tree.
> In some cases we would like to look directly at the HDAT, so this patch
> adds a sysfs node to allow it to be viewed.  This is not possible through
> /dev/mem as it is reserved memory which is stopped by the /dev/mem filter.
>
> Signed-off-by: Matt Brown <matthew.brown@gmail.com>
> ---
>  arch/powerpc/include/asm/opal.h  |  1 +
>  arch/powerpc/platforms/powernv/opal-msglog.c | 49 
> 
>  arch/powerpc/platforms/powernv/opal.c|  2 ++
>  3 files changed, 52 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index 5c7db0f..b26944e 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -277,6 +277,7 @@ extern int opal_async_comp_init(void);
>  extern int opal_sensor_init(void);
>  extern int opal_hmi_handler_init(void);
>  extern int opal_event_init(void);
> +extern void opal_hdat_sysfs_init(void);
>
>  extern int opal_machine_check(struct pt_regs *regs);
>  extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
> diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c 
> b/arch/powerpc/platforms/powernv/opal-msglog.c
> index 39d6ff9..a637055 100644
> --- a/arch/powerpc/platforms/powernv/opal-msglog.c
> +++ b/arch/powerpc/platforms/powernv/opal-msglog.c
> @@ -31,7 +31,13 @@ struct memcons {
> __be32 in_cons;
>  };
>
> +struct hdatInfo {
> +   char *base;
> +   u64 size;
> +};
> +
>  static struct memcons *opal_memcons = NULL;

> +static struct hdatInfo hdat_inf;
I have a few 'o's to spare if you need one.

>
>  ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
>  {
> @@ -136,3 +142,46 @@ void __init opal_msglog_sysfs_init(void)
> if (sysfs_create_bin_file(opal_kobj, _msglog_attr) != 0)
> pr_warn("OPAL: sysfs file creation failed\n");
>  }

> +
> +
> +
> +/* Read function for HDAT attribute in sysfs */
Bonus whitespace!

> +static ssize_t hdat_read(struct file *file, struct kobject *kobj,
> +struct bin_attribute *bin_attr, char *to,
> +loff_t pos, size_t count)
> +{
> +   if (!hdat_inf.base)
> +   return -ENODEV;
> +
> +   return memory_read_from_buffer(to, count, , hdat_inf.base,
> +   hdat_inf.size);
> +}

Hmm... There's been some ideas floating around about removing Skiboot
from the linear mapping and that would break this. However, that is
something we should probably shouldn't worry about until it happens.

> +
> +
> +/* HDAT attribute for sysfs */
> +static struct bin_attribute hdat_attr = {
> +   .attr = {.name = "hdat", .mode = 0444},
> +   .read = hdat_read
> +};
> +
> +void __init opal_hdat_sysfs_init(void)
> +{
> +   u64 hdatAddr[2];
> +
> +   /* Check for the hdat-map prop in device-tree */
> +   if (of_property_read_u64_array(opal_node, "hdat-map", hdatAddr, 2)) {
> +   pr_debug("OPAL: Property hdat-map not found.\n");
> +   return;
> +   }
> +
> +   /* Print out hdat-map values. [0]: base, [1]: size */
> +   pr_debug("HDAT Base address: %#llx\n", hdatAddr[0]);
> +   pr_debug("HDAT Size: %#llx\n", hdatAddr[1]);
> +
> +   hdat_inf.base = phys_to_virt(hdatAddr[0]);
> +   hdat_inf.size = hdatAddr[1];
> +
> +   if (sysfs_create_bin_file(opal_kobj, _attr) != 0)
> +   pr_debug("OPAL: sysfs file creation for HDAT failed");
> +
> +}
> diff --git a/arch/powerpc/platforms/powernv/opal.c 
> b/arch/powerpc/platforms/powernv/opal.c
> index 2822935..cae3745 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -740,6 +740,8 @@ static int __init opal_init(void)
>     opal_sys_param_init();
> /* Setup message log sysfs interface. */
> opal_msglog_sysfs_init();
> +   /* Create hdat object under sys/firmware/opal */
> +   opal_hdat_sysfs_init();
> }
>
> /* Initialize platform devices: IPMI backend, PRD & flash interface */
> --
> 2.9.3
>

Quibbling aside, look ok.

Reviewed-by: Oliver O'Halloran <ooh...@gmail.com>


Re: [PATCH] powerpc: Use octal numbers for file permissions

2017-01-17 Thread Oliver O'Halloran
"It's possible I missed one, but I did genuinely review all of it"

Cyril Bur, 2016
In a hobart pub, specifically The Winston

On 17/01/2017 8:53 PM, "Michael Ellerman"  wrote:

> Cyril Bur  writes:
>
> > On Thu, 2017-01-12 at 14:54 +1100, Russell Currey wrote:
> >> Symbolic macros are unintuitive and hard to read, whereas octal
> constants
> >> are much easier to interpret.  Replace macros for the basic permission
> >> flags (user/group/other read/write/execute) with numeric constants
> >> instead, across the whole powerpc tree.
> >>
> >> Introducing a significant number of changes across the tree for no
> runtime
> >> benefit isn't exactly desirable, but so long as these macros are still
> >> used in the tree people will keep sending patches that add them.  Not
> only
> >> are they hard to parse at a glance, there are multiple ways of coming to
> >> the same value (as you can see with 0444 and 0644 in this patch) which
> >> hurts readability.
> >>
> >> Signed-off-by: Russell Currey 
> >
> > Reviewed-by: Cyril Bur 
>
> Did you really really review every single change?
>
> Because if you did then I don't have to, and that would be *great* :)
>
> cheers
>


Re: [PATCH] powerpc: Use octal numbers for file permissions

2017-01-17 Thread Oliver O'Halloran
It has been pointed out that this actually occured in 2017. My apologies.

On 17/01/2017 9:50 PM, "Oliver O'Halloran" <ooh...@gmail.com> wrote:

> "It's possible I missed one, but I did genuinely review all of it"
>
> Cyril Bur, 2016
> In a hobart pub, specifically The Winston
>
> On 17/01/2017 8:53 PM, "Michael Ellerman" <m...@ellerman.id.au> wrote:
>
>> Cyril Bur <cyril...@gmail.com> writes:
>>
>> > On Thu, 2017-01-12 at 14:54 +1100, Russell Currey wrote:
>> >> Symbolic macros are unintuitive and hard to read, whereas octal
>> constants
>> >> are much easier to interpret.  Replace macros for the basic permission
>> >> flags (user/group/other read/write/execute) with numeric constants
>> >> instead, across the whole powerpc tree.
>> >>
>> >> Introducing a significant number of changes across the tree for no
>> runtime
>> >> benefit isn't exactly desirable, but so long as these macros are still
>> >> used in the tree people will keep sending patches that add them.  Not
>> only
>> >> are they hard to parse at a glance, there are multiple ways of coming
>> to
>> >> the same value (as you can see with 0444 and 0644 in this patch) which
>> >> hurts readability.
>> >>
>> >> Signed-off-by: Russell Currey <rus...@russell.cc>
>> >
>> > Reviewed-by: Cyril Bur <cyril...@gmail.com>
>>
>> Did you really really review every single change?
>>
>> Because if you did then I don't have to, and that would be *great* :)
>>
>> cheers
>>
>


Re: [PATCH v3] powerpc/powernv: add hdat attribute to sysfs

2017-02-27 Thread Oliver O'Halloran
On Mon, Feb 27, 2017 at 9:56 PM, Michael Ellerman  wrote:
> Matt Brown  writes:
>> diff --git a/arch/powerpc/platforms/powernv/opal-hdat.c 
>> b/arch/powerpc/platforms/powernv/opal-hdat.c
>> new file mode 100644
>> index 000..3315dd3
>> --- /dev/null
>> +++ b/arch/powerpc/platforms/powernv/opal-hdat.c
>> @@ -0,0 +1,65 @@
> ...
>> +
>> +
>> +/* HDAT attribute for sysfs */
>> +static struct bin_attribute hdat_attr = {
>> + .attr = {.name = "hdat", .mode = 0444},
>  
> ajd and oohal report to my office.

I don't think there's anything in the HDAT that's sensitive. That
said, this might not be true in the future so making it only readable
by root might be a good idea.

Oliver


[PATCH 1/5] powerpc/smp: use cpu_to_chip_id() to find siblings

2017-03-01 Thread Oliver O'Halloran
To determine which logical CPUs are on the same core the kernel uses the
ibm,chipid property from the device tree node associated with that cpu.
The lookup for this this information is currently open coded in both
traverse_siblings() and traverse_siblings_chip_id(). This patch replaces
these manual lookups with the existing cpu_to_chip_id() function.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/kernel/smp.c | 39 +--
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 893bd7f79be6..dfe0e1d9cd06 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -613,19 +613,11 @@ EXPORT_SYMBOL_GPL(cpu_first_thread_of_core);
 
 static void traverse_siblings_chip_id(int cpu, bool add, int chipid)
 {
-   const struct cpumask *mask;
-   struct device_node *np;
-   int i, plen;
-   const __be32 *prop;
+   const struct cpumask *mask = add ? cpu_online_mask : cpu_present_mask;
+   int i;
 
-   mask = add ? cpu_online_mask : cpu_present_mask;
for_each_cpu(i, mask) {
-   np = of_get_cpu_node(i, NULL);
-   if (!np)
-   continue;
-   prop = of_get_property(np, "ibm,chip-id", );
-   if (prop && plen == sizeof(int) &&
-   of_read_number(prop, 1) == chipid) {
+   if (cpu_to_chip_id(i) == chipid) {
if (add) {
cpumask_set_cpu(cpu, cpu_core_mask(i));
cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -634,7 +626,6 @@ static void traverse_siblings_chip_id(int cpu, bool add, 
int chipid)
cpumask_clear_cpu(i, cpu_core_mask(cpu));
}
}
-   of_node_put(np);
}
 }
 
@@ -664,23 +655,19 @@ static void traverse_core_siblings(int cpu, bool add)
 {
struct device_node *l2_cache, *np;
const struct cpumask *mask;
-   int i, chip, plen;
-   const __be32 *prop;
+   int chip_id;
+   int i;
 
-   /* First see if we have ibm,chip-id properties in cpu nodes */
-   np = of_get_cpu_node(cpu, NULL);
-   if (np) {
-   chip = -1;
-   prop = of_get_property(np, "ibm,chip-id", );
-   if (prop && plen == sizeof(int))
-   chip = of_read_number(prop, 1);
-   of_node_put(np);
-   if (chip >= 0) {
-   traverse_siblings_chip_id(cpu, add, chip);
-   return;
-   }
+   /* threads that share a chip-id are considered siblings (same die) */
+   chip_id = cpu_to_chip_id(cpu);
+
+   if (chip_id >= 0) {
+   traverse_siblings_chip_id(cpu, add, chip_id);
+   return;
}
 
+   /* if the chip-id fails then threads which share L2 cache are */
+
l2_cache = cpu_to_l2cache(cpu);
mask = add ? cpu_online_mask : cpu_present_mask;
for_each_cpu(i, mask) {
-- 
2.9.3



[PATCH 2/5] powerpc/smp: add set_cpus_related()

2017-03-01 Thread Oliver O'Halloran
Add a helper function for updating the per-cpu core and sibling thread
cpumasks. This helper just sets (or clears) the relevant bit in the
cpumasks of each CPU. This is open-coded in several places inside the
mask setup code so moving it into a separate function is a sensible
cleanup.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/kernel/smp.c | 61 ---
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index dfe0e1d9cd06..1c531887ca51 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -377,6 +377,25 @@ static void smp_store_cpu_info(int id)
 #endif
 }
 
+/*
+ * Relationships between CPUs are maintained in a set of per-cpu cpumasks. We
+ * need to ensure that they are kept consistent between CPUs when they are
+ * changed.
+ *
+ * This is slightly tricky since the core mask must be a strict superset of
+ * the sibling mask.
+ */
+static void set_cpus_related(int i, int j, bool related, struct cpumask 
*(*relation_fn)(int))
+{
+   if (related) {
+   cpumask_set_cpu(i, relation_fn(j));
+   cpumask_set_cpu(j, relation_fn(i));
+   } else {
+   cpumask_clear_cpu(i, relation_fn(j));
+   cpumask_clear_cpu(j, relation_fn(i));
+   }
+}
+
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
unsigned int cpu;
@@ -616,17 +635,9 @@ static void traverse_siblings_chip_id(int cpu, bool add, 
int chipid)
const struct cpumask *mask = add ? cpu_online_mask : cpu_present_mask;
int i;
 
-   for_each_cpu(i, mask) {
-   if (cpu_to_chip_id(i) == chipid) {
-   if (add) {
-   cpumask_set_cpu(cpu, cpu_core_mask(i));
-   cpumask_set_cpu(i, cpu_core_mask(cpu));
-   } else {
-   cpumask_clear_cpu(cpu, cpu_core_mask(i));
-   cpumask_clear_cpu(i, cpu_core_mask(cpu));
-   }
-   }
-   }
+   for_each_cpu(i, mask)
+   if (cpu_to_chip_id(i) == chipid)
+   set_cpus_related(cpu, i, add, cpu_core_mask);
 }
 
 /* Must be called when no change can occur to cpu_present_mask,
@@ -666,23 +677,17 @@ static void traverse_core_siblings(int cpu, bool add)
return;
}
 
-   /* if the chip-id fails then threads which share L2 cache are */
-
+   /* if the chip-id fails then group siblings by the L2 cache */
l2_cache = cpu_to_l2cache(cpu);
mask = add ? cpu_online_mask : cpu_present_mask;
for_each_cpu(i, mask) {
np = cpu_to_l2cache(i);
if (!np)
continue;
-   if (np == l2_cache) {
-   if (add) {
-   cpumask_set_cpu(cpu, cpu_core_mask(i));
-   cpumask_set_cpu(i, cpu_core_mask(cpu));
-   } else {
-   cpumask_clear_cpu(cpu, cpu_core_mask(i));
-   cpumask_clear_cpu(i, cpu_core_mask(cpu));
-   }
-   }
+
+   if (np == l2_cache)
+   set_cpus_related(cpu, i, add, cpu_core_mask);
+
of_node_put(np);
}
of_node_put(l2_cache);
@@ -720,15 +725,13 @@ void start_secondary(void *unused)
for (i = 0; i < threads_per_core; i++) {
if (cpu_is_offline(base + i) && (cpu != base + i))
continue;
-   cpumask_set_cpu(cpu, cpu_sibling_mask(base + i));
-   cpumask_set_cpu(base + i, cpu_sibling_mask(cpu));
+   set_cpus_related(cpu, base + i, true, cpu_sibling_mask);
 
/* cpu_core_map should be a superset of
 * cpu_sibling_map even if we don't have cache
 * information, so update the former here, too.
 */
-   cpumask_set_cpu(cpu, cpu_core_mask(base + i));
-   cpumask_set_cpu(base + i, cpu_core_mask(cpu));
+   set_cpus_related(cpu, base + i, true, cpu_core_mask);
}
traverse_core_siblings(cpu, true);
 
@@ -818,10 +821,8 @@ int __cpu_disable(void)
/* Update sibling maps */
base = cpu_first_thread_sibling(cpu);
for (i = 0; i < threads_per_core && base + i < nr_cpu_ids; i++) {
-   cpumask_clear_cpu(cpu, cpu_sibling_mask(base + i));
-   cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu));
-   cpumask_clear_cpu(cpu, cpu_core_mask(base + i));
-   cpumask_clear_cpu(base + i, cpu_core_mask(cpu));
+   set_cpus_related(cpu, base + i, false, cpu_sibling_mask);
+   set_cpus_related(cpu, base + i, false, cpu_core_mask);
}
tr

[PATCH 3/5] powerpc/smp: Add update_cpu_masks()

2017-03-01 Thread Oliver O'Halloran
When adding and removing a CPU from the system the per-cpu masks that
are used by the scheduler to construct scheduler domains need to be updated
to account for the cpu entering or exiting the system. Currently this logic
is open-coded for the thread sibling mask and shared for the core mask.
This patch moves all the logic for rebuilding these masks into a single
function and simplifies the logic which determines which CPUs are within
a "core".

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/kernel/smp.c | 90 ---
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 1c531887ca51..3922cace927e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -630,14 +630,20 @@ int cpu_first_thread_of_core(int core)
 }
 EXPORT_SYMBOL_GPL(cpu_first_thread_of_core);
 
-static void traverse_siblings_chip_id(int cpu, bool add, int chipid)
+static bool update_core_mask_by_chip_id(int cpu, bool add)
 {
const struct cpumask *mask = add ? cpu_online_mask : cpu_present_mask;
+   int chipid = cpu_to_chip_id(cpu);
int i;
 
+   if (chipid == -1)
+   return false;
+
for_each_cpu(i, mask)
if (cpu_to_chip_id(i) == chipid)
set_cpus_related(cpu, i, add, cpu_core_mask);
+
+   return true;
 }
 
 /* Must be called when no change can occur to cpu_present_mask,
@@ -662,42 +668,72 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
 }
 
-static void traverse_core_siblings(int cpu, bool add)
+static bool update_core_mask_by_l2(int cpu, bool onlining)
 {
+   const struct cpumask *mask = onlining ? cpu_online_mask : 
cpu_present_mask;
struct device_node *l2_cache, *np;
-   const struct cpumask *mask;
-   int chip_id;
int i;
 
-   /* threads that share a chip-id are considered siblings (same die) */
-   chip_id = cpu_to_chip_id(cpu);
-
-   if (chip_id >= 0) {
-   traverse_siblings_chip_id(cpu, add, chip_id);
-   return;
-   }
-
-   /* if the chip-id fails then group siblings by the L2 cache */
l2_cache = cpu_to_l2cache(cpu);
-   mask = add ? cpu_online_mask : cpu_present_mask;
+   if (l2_cache == NULL)
+   return false;
+
for_each_cpu(i, mask) {
np = cpu_to_l2cache(i);
if (!np)
continue;
 
if (np == l2_cache)
-   set_cpus_related(cpu, i, add, cpu_core_mask);
+   set_cpus_related(cpu, i, onlining, cpu_core_mask);
 
of_node_put(np);
}
of_node_put(l2_cache);
+
+   return true;
+}
+
+static void update_thread_mask(int cpu, bool onlining)
+{
+   int base = cpu_first_thread_sibling(cpu);
+   int i;
+
+   pr_info("CPUDEBUG: onlining cpu %d, base %d, thread_per_core %d",
+   cpu, base, threads_per_core);
+
+   for (i = 0; i < threads_per_core; i++) {
+   /* Threads are onlined one by one. By the final time this
+* function is called for the core the sibling mask for each
+* thread will be complete, but we need to ensure that offline
+* threads aren't touched before they run start_secondary() */
+   if (onlining && cpu_is_offline(base + i) && (cpu != base + i))
+   continue;
+
+   set_cpus_related(cpu, base + i, onlining, cpu_sibling_mask);
+   }
+}
+
+static void update_cpu_masks(int cpu, bool onlining)
+{
+   int i;
+
+   update_thread_mask(cpu, onlining);
+
+   if (update_core_mask_by_chip_id(cpu, onlining))
+   return;
+
+   if (update_core_mask_by_l2(cpu, onlining))
+   return;
+
+   /* if all else fails duplicate the sibling mask */
+   for_each_cpu(i, cpu_sibling_mask(cpu))
+   set_cpus_related(cpu, i, onlining, cpu_core_mask);
 }
 
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
unsigned int cpu = smp_processor_id();
-   int i, base;
 
atomic_inc(_mm.mm_count);
current->active_mm = _mm;
@@ -721,19 +757,7 @@ void start_secondary(void *unused)
vdso_getcpu_init();
 #endif
/* Update sibling maps */
-   base = cpu_first_thread_sibling(cpu);
-   for (i = 0; i < threads_per_core; i++) {
-   if (cpu_is_offline(base + i) && (cpu != base + i))
-   continue;
-   set_cpus_related(cpu, base + i, true, cpu_sibling_mask);
-
-   /* cpu_core_map should be a superset of
-* cpu_sibling_map even if we don't have cache
-* information, so update the former here, too.
-*/
-

[PATCH 4/5] powerpc/smp: add cpu_cache_mask

2017-03-01 Thread Oliver O'Halloran
Traditionally we have only ever tracked which CPUs are in the same core
(cpu_sibling_mask) and on the same die (cpu_core_mask). For Power9 we
need to be aware of which CPUs share cache with each other so this patch
adds cpu_cache_mask and the underlying cpu_cache_map variable to track
this.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/include/asm/smp.h | 6 ++
 arch/powerpc/kernel/smp.c  | 5 +
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 32db16d2e7ad..a7fc3a105d61 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -94,6 +94,7 @@ static inline void set_hard_smp_processor_id(int cpu, int 
phys)
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_var_t, cpu_cache_map);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
 
 static inline struct cpumask *cpu_sibling_mask(int cpu)
@@ -106,6 +107,11 @@ static inline struct cpumask *cpu_core_mask(int cpu)
return per_cpu(cpu_core_map, cpu);
 }
 
+static inline struct cpumask *cpu_cache_mask(int cpu)
+{
+   return per_cpu(cpu_cache_map, cpu);
+}
+
 extern int cpu_to_core_id(int cpu);
 
 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3922cace927e..5571f30ff72d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -72,9 +72,11 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 struct thread_info *secondary_ti;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_cache_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
 
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
 /* SMP operations for this machine */
@@ -415,6 +417,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
for_each_possible_cpu(cpu) {
zalloc_cpumask_var_node(_cpu(cpu_sibling_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+   zalloc_cpumask_var_node(_cpu(cpu_cache_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
/*
@@ -428,6 +432,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
}
 
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
+   cpumask_set_cpu(boot_cpuid, cpu_cache_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
if (smp_ops && smp_ops->probe)
-- 
2.9.3



[PATCH 5/5] powerpc/smp: Add Power9 scheduler topology

2017-03-01 Thread Oliver O'Halloran
In previous generations of Power processors each core had a private L2
cache. The Power9 processor has a slightly different architecture where
the L2 cache is shared among pairs of cores rather than being completely
private.

Making the scheduler aware of this cache sharing allows the scheduler to
make more intelligent migration decisions. When one core in the pair is
overloaded tasks can be migrated to its paired core to improve throughput
without the cache-refilling penalty typically associated with task
migration.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/kernel/smp.c | 44 
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5571f30ff72d..5e1811b24415 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -673,7 +673,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
 }
 
-static bool update_core_mask_by_l2(int cpu, bool onlining)
+static bool update_mask_by_l2(int cpu, bool onlining, struct cpumask 
*(*mask_fn)(int))
 {
const struct cpumask *mask = onlining ? cpu_online_mask : 
cpu_present_mask;
struct device_node *l2_cache, *np;
@@ -689,7 +689,7 @@ static bool update_core_mask_by_l2(int cpu, bool onlining)
continue;
 
if (np == l2_cache)
-   set_cpus_related(cpu, i, onlining, cpu_core_mask);
+   set_cpus_related(cpu, i, onlining, mask_fn);
 
of_node_put(np);
}
@@ -724,10 +724,17 @@ static void update_cpu_masks(int cpu, bool onlining)
 
update_thread_mask(cpu, onlining);
 
+   /* we need the l2 cache mask for the power9 scheduler topology */
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   update_mask_by_l2(cpu, onlining, cpu_cache_mask);
+
+   /* now build the core mask */
+   set_cpus_related(cpu, cpu, onlining, cpu_core_mask);
+
if (update_core_mask_by_chip_id(cpu, onlining))
return;
 
-   if (update_core_mask_by_l2(cpu, onlining))
+   if (update_mask_by_l2(cpu, onlining, cpu_core_mask))
return;
 
/* if all else fails duplicate the sibling mask */
@@ -805,6 +812,32 @@ static struct sched_domain_topology_level 
powerpc_topology[] = {
{ NULL, },
 };
 
+
+/* P9 has a slightly odd architecture where two, four thread cores share an L2
+ * cache. For highly threaded workloads it makes sense to try and keep tasks
+ * inside the pair for better cache utilisation so the scheduler needs to be
+ * aware of this. */
+static int powerpc_shared_cache_flags(void)
+{
+   return SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING;
+}
+
+/* this is kind of gross, but passing cpu_cache_mask directly
+ * causes the build to fail due to incompatible pointer types */
+static inline const struct cpumask *cpu_cache_mask_c(int cpu)
+{
+   return cpu_cache_mask(cpu);
+}
+
+static struct sched_domain_topology_level power9_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+   { cpu_cache_mask_c, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { NULL, },
+};
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
cpumask_var_t old_mask;
@@ -829,7 +862,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
dump_numa_cpu_topology();
 
-   set_sched_topology(powerpc_topology);
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   set_sched_topology(power9_topology);
+   else
+   set_sched_topology(powerpc_topology);
 
 }
 
-- 
2.9.3



Re: [RFC] Remove memory from nodes for memtrace.

2017-02-26 Thread Oliver O'Halloran
On Thu, Feb 23, 2017 at 8:39 AM, Rashmica Gupta  wrote:
>  Some powerpc hardware features may want to gain access to a
>  chunk of undisturbed real memory.  This update provides a means to unplug
>  said memory from the kernel with a set of sysfs calls.  By writing an integer
>  containing  the size of memory to be unplugged into
>  /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
>  memory from the end of each available chip's memory space. In addition, the
>  means to read out the contents of the unplugged memory is also provided by
>  reading out the /sys/kernel/debug/powerpc/memtrace//dump file.
>
> Signed-off-by: Rashmica Gupta 
> ---
> Written by Douglas Lehr .
> Have tested and seems to work as I would expect. Only change I have made from
> the original is to check that the value being written to the debugfs file is
> not 0 (or obscenely large), as otherwise you get a nice kernel oops where the
> kernel attempts to access data at 0xfffe0.
>
> Thoughts about doing this with hot unplug or other changes?
>
>  arch/powerpc/mm/hash_native_64.c  |  39 +++-
>  arch/powerpc/platforms/powernv/Makefile   |   1 +
>  arch/powerpc/platforms/powernv/memtrace.c | 285 
> ++
>  3 files changed, 321 insertions(+), 4 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/memtrace.c
>
> diff --git a/arch/powerpc/mm/hash_native_64.c 
> b/arch/powerpc/mm/hash_native_64.c
> index cc33260..44cc6ce 100644
> --- a/arch/powerpc/mm/hash_native_64.c
> +++ b/arch/powerpc/mm/hash_native_64.c
> @@ -3,7 +3,7 @@
>   *
>   * SMP scalability work:
>   *Copyright (C) 2001 Anton Blanchard , IBM
> - *
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte 
> *hptep)
> while (1) {
> if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
> break;
> -   while(test_bit(HPTE_LOCK_BIT, word))
> +   while (test_bit(HPTE_LOCK_BIT, word))
> cpu_relax();
> }
>  }
> @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long 
> hpte_group, unsigned long vpn,
> }
>
> for (i = 0; i < HPTES_PER_GROUP; i++) {
> -   if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> +   if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> /* retry with lock held */
> native_lock_hpte(hptep);
> -   if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
> +   if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
> break;
> native_unlock_hpte(hptep);
> }
> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long 
> newpp, unsigned long ea,
> tlbie(vpn, psize, psize, ssize, 0);
>  }
>
> +/*
> + * Remove a bolted kernel entry. Memory hotplug uses this.
> + *
> + * No need to lock here because we should be the only user.
> + */
> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
> +{
> +   unsigned long vpn;
> +   unsigned long vsid;
> +   long slot;
> +   struct hash_pte *hptep;
> +
> +   vsid = get_kernel_vsid(ea, ssize);
> +   vpn = hpt_vpn(ea, vsid, ssize);
> +
> +   slot = native_hpte_find(vpn, psize, ssize);
> +   if (slot == -1)
> +   return -ENOENT;
> +
> +   hptep = htab_address + slot;
> +
> +   /* Invalidate the hpte */
> +   hptep->v = 0;
> +
> +   /* Invalidate the TLB */
> +   tlbie(vpn, psize, psize, ssize, 0);
> +   return 0;
> +}
> +
> +
>  static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
>int bpsize, int apsize, int ssize, int 
> local)
>  {
> @@ -722,6 +752,7 @@ void __init hpte_init_native(void)
> mmu_hash_ops.hpte_invalidate= native_hpte_invalidate;
> mmu_hash_ops.hpte_updatepp  = native_hpte_updatepp;
> mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
> +   mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
> mmu_hash_ops.hpte_insert= native_hpte_insert;
> mmu_hash_ops.hpte_remove= native_hpte_remove;
> mmu_hash_ops.hpte_clear_all = native_hpte_clear;
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb..2026661 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH) += eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
>  obj-$(CONFIG_MEMORY_FAILURE)   += 

Re: [PATCH v3] powerpc/powernv: add hdat attribute to sysfs

2017-02-26 Thread Oliver O'Halloran
On Mon, Feb 27, 2017 at 12:59 PM, Andrew Donnellan
<andrew.donnel...@au1.ibm.com> wrote:
> On 24/02/17 17:20, Matt Brown wrote:
>>
>> The HDAT data area is consumed by skiboot and turned into a device-tree.
>> In some cases we would like to look directly at the HDAT, so this patch
>> adds a sysfs node to allow it to be viewed.  This is not possible through
>> /dev/mem as it is reserved memory which is stopped by the /dev/mem filter.
>>
>> Signed-off-by: Matt Brown <matthew.brown@gmail.com>
>
>
> Changes look good, thanks for addressing the comments! Still a couple of
> minor points below, otherwise:
>
> Reviewed-by: Andrew Donnellan <andrew.donnel...@au1.ibm.com>
>
> Stewart: this might need your ACK?
>
>
>> ---
>>
>> Changes between v2 to v3:
>> - fixed header comments
>> - simplified if statement
>>
>> ---
>>  arch/powerpc/include/asm/opal.h|  1 +
>>  arch/powerpc/platforms/powernv/Makefile|  1 +
>>  arch/powerpc/platforms/powernv/opal-hdat.c | 65
>> ++
>>  arch/powerpc/platforms/powernv/opal.c  |  2 +
>>  4 files changed, 69 insertions(+)
>>  create mode 100644 arch/powerpc/platforms/powernv/opal-hdat.c
>>
>> diff --git a/arch/powerpc/include/asm/opal.h
>> b/arch/powerpc/include/asm/opal.h
>> index 5c7db0f..b26944e 100644
>> --- a/arch/powerpc/include/asm/opal.h
>> +++ b/arch/powerpc/include/asm/opal.h
>> @@ -277,6 +277,7 @@ extern int opal_async_comp_init(void);
>>  extern int opal_sensor_init(void);
>>  extern int opal_hmi_handler_init(void);
>>  extern int opal_event_init(void);
>> +extern void opal_hdat_sysfs_init(void);
>>
>>  extern int opal_machine_check(struct pt_regs *regs);
>>  extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
>> diff --git a/arch/powerpc/platforms/powernv/Makefile
>> b/arch/powerpc/platforms/powernv/Makefile
>> index b5d98cb..9a0c9d6 100644
>> --- a/arch/powerpc/platforms/powernv/Makefile
>> +++ b/arch/powerpc/platforms/powernv/Makefile
>> @@ -3,6 +3,7 @@ obj-y   += opal-rtc.o opal-nvram.o
>> opal-lpc.o opal-flash.o
>>  obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o
>> opal-sensor.o
>>  obj-y  += opal-msglog.o opal-hmi.o opal-power.o
>> opal-irqchip.o
>>  obj-y  += opal-kmsg.o
>> +obj-y  += opal-hdat.o
>
>
> Normally we keep putting new object files on the same line until it gets
> long enough that we have to break it. This is very minor though :)
>
>
>>
>>  obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
>>  obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o
>> diff --git a/arch/powerpc/platforms/powernv/opal-hdat.c
>> b/arch/powerpc/platforms/powernv/opal-hdat.c
>> new file mode 100644
>> index 000..3315dd3
>> --- /dev/null
>> +++ b/arch/powerpc/platforms/powernv/opal-hdat.c
>> @@ -0,0 +1,65 @@
>> +/*
>> + * PowerNV OPAL HDAT interface
>> + *
>> + * Author: Matt Brown <matthew.brown@gmail.com>
>> + *
>> + * Copyright 2017 IBM Corp.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License
>> + * as published by the Free Software Foundation; either version
>> + * 2 of the License, or (at your option) any later version.
>> + */
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +struct hdat_info {
>> +   char *base;
>> +   u64 size;
>> +};
>> +
>> +static struct hdat_info hdat_inf;
>
>
> As Oliver pointed out, we could do with a better name than hdat_inf - it's
> only one character away from the name of the struct type. Hmm, perhaps
> "hdat_location", or maybe Oliver has a better suggestion.

I'm not that bothered by it.

Reviewed-by: Oliver O'Halloran <ooh...@gmail.com>

>
>
> --
> Andrew Donnellan  OzLabs, ADL Canberra
> andrew.donnel...@au1.ibm.com  IBM Australia Limited
>


[PATCH 3/6] powerpc/boot: use the preboot decompression API

2016-08-30 Thread Oliver O'Halloran
Currently the powerpc boot wrapper has its own wrapper around zlib to
handle decompressing gzipped kernels. The kernel decompressor library
functions now provide a generic interface that can be used in the pre-boot
environment. This allows boot wrappers to easily support different
compression algorithms. This patch converts the wrapper to use this new
API, but does not add support for using new algorithms.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  10 ++-
 arch/powerpc/boot/decompress.c | 142 +
 arch/powerpc/boot/main.c   |  35 +-
 arch/powerpc/boot/ops.h|   3 +
 4 files changed, 170 insertions(+), 20 deletions(-)
 create mode 100644 arch/powerpc/boot/decompress.c

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 5a99a485d80a..3fdd74ac2fae 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -65,11 +65,12 @@ $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
 # the kernel's version of zlib pulls in a lot of other kernel headers
 # which we don't provide inside the wrapper.
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c
 zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
 zlibheaders-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
 zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o decompress.o main.o): \
$(addprefix $(obj)/,$(zliblinuxheader-y)) \
$(addprefix $(obj)/,$(zlibheaders-y)) \
$(addprefix $(obj)/,$(zlib-decomp-y))
@@ -80,10 +81,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h
 $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
$(addprefix $(obj)/,$(libfdtheader))
 
-src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
+src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
+   decompress.o elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -144,6 +145,9 @@ $(addprefix $(obj)/,$(zlibheaders-y)): $(obj)/%: 
$(srctree)/lib/zlib_inflate/%
 $(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
+$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/%
+   $(call cmd,copy_kern_src)
+
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
 
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
new file mode 100644
index ..60fc6fb26867
--- /dev/null
+++ b/arch/powerpc/boot/decompress.c
@@ -0,0 +1,142 @@
+/*
+ * Wrapper around the kernel's pre-boot decompression library.
+ *
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "elf.h"
+#include "page.h"
+#include "string.h"
+#include "stdio.h"
+#include "ops.h"
+#include "reg.h"
+#include "types.h"
+
+/*
+ * The decompressor_*.c files play #ifdef games so they can be used in both
+ * pre-boot and regular kernel code. We need these definitions to make the
+ * includes work.
+ */
+
+#define STATIC static
+#define INIT
+#define __always_inline inline
+
+/*
+ * The build process will copy the required zlib source files and headers
+ * out of lib/ and "fix" the includes so they do not pull in other kernel
+ * headers.
+ */
+
+#ifdef CONFIG_KERNEL_GZIP
+#  include "decompress_inflate.c"
+#endif
+
+/* globals for tracking the state of the decompression */
+static unsigned long decompressed_bytes;
+static unsigned long limit;
+static unsigned long skip;
+static char *output_buffer;
+
+/*
+ * flush() is called by __decompress() when the decompressor's scratch buffer 
is
+ * full.
+ */
+static long flush(void *v, unsigned long buffer_size)
+{
+   unsigned long end = decompressed_bytes + buffer_size;
+   unsigned long size = buffer_size;
+   unsigned long offset = 0;
+   char *in = v;
+   char *out;
+
+   /*
+* if we hit our decompression limit, we need to fake an error to abort
+* the in-progress decompression.
+*/
+   if (decompressed_bytes >= limit)
+   return -1;
+
+   /* skip this entire block */
+   if

[PATCH 1/6] powerpc/boot: add sed script

2016-08-30 Thread Oliver O'Halloran
The powerpc boot wrapper is compiled with a separate "bootcc" toolchain
rather than the toolchain used for the rest of the kernel. The main
problem with this is that the wrapper does not have access to the kernel
headers (without a lot of gross hacks). To get around this the required
headers are copied into the build directory via several sed scripts
which rewrite problematic includes. This patch moves these fixups out of
the makefile into a separate .sed script file to clean up makefile
slightly.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile  | 18 ++
 arch/powerpc/boot/fixup-headers.sed | 12 
 2 files changed, 18 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/boot/fixup-headers.sed

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 1a2a6e8dc40d..f98e42ee2534 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -125,23 +125,17 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix 
$(obj)/, $(src-wlib
 obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat
 obj-plat: $(libfdt)
 
-quiet_cmd_copy_zlib = COPY$@
-  cmd_copy_zlib = sed "s@__used@@;s@]*\).*@\"\1\"@" $< > $@
-
-quiet_cmd_copy_zlibheader = COPY$@
-  cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@
-# stddef.h for NULL
-quiet_cmd_copy_zliblinuxheader = COPY$@
-  cmd_copy_zliblinuxheader = sed 
"s@@\"string.h\"@;s@@@;s@]*\).*@\"\1\"@"
 $< > $@
+quiet_cmd_copy_kern_src = COPY$@
+  cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
 $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlib)
+   $(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlibheader)
+$(addprefix $(obj)/,$(zlibheaders)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+   $(call cmd,copy_kern_src)
 
 $(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
-   $(call cmd,copy_zliblinuxheader)
+   $(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
diff --git a/arch/powerpc/boot/fixup-headers.sed 
b/arch/powerpc/boot/fixup-headers.sed
new file mode 100644
index ..96362428eb37
--- /dev/null
+++ b/arch/powerpc/boot/fixup-headers.sed
@@ -0,0 +1,12 @@
+# Copyright 2016 IBM Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 or later as
+# published by the Free Software Foundation.
+
+s@#include @@;
+s@\"zlib_inflate/\([^\"]*\).*@"\1"@;
+s@@@;
+
+s@__used@@;
+s@]*\).*@"\1"@;
-- 
2.5.5



[PATCH 5/6] powerpc/boot: add xz support to the wrapper script

2016-08-30 Thread Oliver O'Halloran
This modifies the script so that the -Z option takes an argument to
specify the compression type. It can either be 'gz', 'xz' or 'none'.
The legacy --no-gzip and -z options are still supported and will set
the compression to none and gzip respectively, but they are not
documented.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  7 --
 arch/powerpc/boot/wrapper  | 61 ++
 2 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 3fdd74ac2fae..482bac2af1ff 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -212,10 +212,13 @@ CROSSWRAP := -C "$(CROSS_COMPILE)"
 endif
 endif
 
+compressor-$(CONFIG_KERNEL_GZIP) := gz
+
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
-  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \
-   $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux
+  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \
+   $(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \
+   vmlinux
 
 image-$(CONFIG_PPC_PSERIES)+= zImage.pseries
 image-$(CONFIG_PPC_POWERNV)+= zImage.pseries
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 6681ec3625c9..cf7631be5007 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -20,6 +20,8 @@
 # -D dir   specify directory containing data files used by script
 #  (default ./arch/powerpc/boot)
 # -W dir   specify working directory for temporary files (default .)
+# -z   use gzip (legacy)
+# -Z zsuffixcompression to use (gz, xz or none)
 
 # Stop execution if any command fails
 set -e
@@ -38,7 +40,7 @@ dtb=
 dts=
 cacheit=
 binary=
-gzip=.gz
+compression=.gz
 pie=
 format=
 
@@ -59,7 +61,8 @@ tmpdir=.
 usage() {
 echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
 echo '   [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-echo '   [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2
+echo '   [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
+echo '   [--no-compression] [vmlinux]' >&2
 exit 1
 }
 
@@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do
[ "$#" -gt 0 ] || usage
tmpdir="$1"
;;
+-z)
+   compression=.gz
+   ;;
+-Z)
+   shift
+   [ "$#" -gt 0 ] || usage
+[ "$1" != "gz" -o "$1" != "xz" -o "$1" != "none" ] || usage
+
+   compression=".$1"
+
+if [ $compression = ".none" ]; then
+compression=
+fi
+   ;;
 --no-gzip)
-gzip=
+# a "feature" of the wrapper script is that it can be used outside
+# the kernel tree. So keeping this around for backwards compatibility.
+compression=
 ;;
 -?)
usage
@@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do
 shift
 done
 
+
 if [ -n "$dts" ]; then
 if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then
dts="$object/dts/$dts"
@@ -212,7 +232,7 @@ miboot|uboot*)
 ;;
 cuboot*)
 binary=y
-gzip=
+compression=
 case "$platform" in
 *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc)
 platformo=$object/cuboot-8xx.o
@@ -243,7 +263,7 @@ cuboot*)
 ps3)
 platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o"
 lds=$object/zImage.ps3.lds
-gzip=
+compression=
 ext=bin
 objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data"
 ksection=.kernel:vmlinux.bin
@@ -310,27 +330,37 @@ mvme7100)
 esac
 
 vmz="$tmpdir/`basename \"$kernel\"`.$ext"
-if [ -z "$cacheit" -o ! -f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then
-${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
 
-strip_size=$(stat -c %s $vmz.$$)
+# Calculate the vmlinux.strip size
+${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
+strip_size=$(stat -c %s $vmz.$$)
 
-if [ -n "$gzip" ]; then
+if [ -z "$cacheit" -o ! -f "$vmz$compression" -o "$vmz$compression" -ot 
"$kernel" ]; then
+# recompress the image if we need to
+case $compression in
+.xz)
+xz --check=crc32 -f -9 "$vmz.$$"
+;;
+.gz)
 gzip -n -f -9 "$vmz.$$"
-fi
+;;
+*)
+# drop the compression suffix so the stripped vmlinux is used
+compression=
+   ;;
+esac
 
 if [ -n "$cacheit" ]; then
-   mv -f "$

[PATCH 2/6] powerpc/boot: Use CONFIG_KERNEL_GZIP

2016-08-30 Thread Oliver O'Halloran
Most architectures allow the compression algorithm used to produce the
vmlinuz image to be selected as a kernel config option. In preparation
for supporting algorithms other than gzip in the powerpc boot wrapper
the makefile needs to be modified to use these config options.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/boot/Makefile | 31 +++
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 927d2ab2ce08..9f0568852ecf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -167,6 +167,7 @@ config PPC
select GENERIC_CPU_AUTOPROBE
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_ARCH_HARDENED_USERCOPY
+   select HAVE_KERNEL_GZIP
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index f98e42ee2534..5a99a485d80a 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -19,10 +19,14 @@
 
 all: $(obj)/zImage
 
+compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
--isystem $(shell $(CROSS32CC) -print-file-name=include)
+-isystem $(shell $(CROSS32CC) -print-file-name=include) \
+-D$(compress-y)
+
 ifdef CONFIG_PPC64_BOOT_WRAPPER
 BOOTCFLAGS += -m64
 endif
@@ -59,13 +63,16 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
+# the kernel's version of zlib pulls in a lot of other kernel headers
+# which we don't provide inside the wrapper.
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
+zlibheaders-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
+zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-zlib   := inffast.c inflate.c inftrees.c
-zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h
-zliblinuxheader := zlib.h zconf.h zutil.h
-
-$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \
-   $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix 
$(obj)/,$(zlibheader))
+$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+   $(addprefix $(obj)/,$(zliblinuxheader-y)) \
+   $(addprefix $(obj)/,$(zlibheaders-y)) \
+   $(addprefix $(obj)/,$(zlib-decomp-y))
 
 libfdt   := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
@@ -76,7 +83,7 @@ $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o 
epapr.o opal.o): \
 src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
+   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -128,13 +135,13 @@ obj-plat: $(libfdt)
 quiet_cmd_copy_kern_src = COPY$@
   cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
-$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zlibheaders)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlibheaders-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
+$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
@@ -153,7 +160,7 @@ $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S
 $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S
@cp $< $@
 
-clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \
+clean-files := $(zlib-y) $(zlibheaders-y) $(zliblinuxheader-y) \
$(libfdt) $(libfdtheader) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
-- 
2.5.5



[PATCH 6/6] powerpc/boot: Add support for XZ compression

2016-08-30 Thread Oliver O'Halloran
This patch adds an option to use XZ compression for the kernel image.
Currently this is only enabled for PPC64 targets since the bulk of the
32bit platforms produce uboot images which do not use the wrapper.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  2 ++
 arch/powerpc/boot/decompress.c |  5 +
 arch/powerpc/boot/types.h  | 10 +
 arch/powerpc/boot/xz_config.h  | 39 ++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 5 files changed, 57 insertions(+)
 create mode 100644 arch/powerpc/boot/xz_config.h

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 482bac2af1ff..de36806c1a73 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,6 +20,7 @@
 all: $(obj)/zImage
 
 compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+compress-$(CONFIG_KERNEL_XZ)   := CONFIG_KERNEL_XZ
 
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
@@ -213,6 +214,7 @@ endif
 endif
 
 compressor-$(CONFIG_KERNEL_GZIP) := gz
+compressor-$(CONFIG_KERNEL_XZ)   := xz
 
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
index 60fc6fb26867..8f32ea4289af 100644
--- a/arch/powerpc/boot/decompress.c
+++ b/arch/powerpc/boot/decompress.c
@@ -37,6 +37,11 @@
 #  include "decompress_inflate.c"
 #endif
 
+#ifdef CONFIG_KERNEL_XZ
+#  include "xz_config.h"
+#  include "../../../lib/decompress_unxz.c"
+#endif
+
 /* globals for tracking the state of the decompression */
 static unsigned long decompressed_bytes;
 static unsigned long limit;
diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h
index 85565a89bcc2..0362a262a299 100644
--- a/arch/powerpc/boot/types.h
+++ b/arch/powerpc/boot/types.h
@@ -34,4 +34,14 @@ typedef s64 int64_t;
(void) (&_x == &_y);\
_x > _y ? _x : _y; })
 
+#define min_t(type, a, b) min(((type) a), ((type) b))
+#define max_t(type, a, b) max(((type) a), ((type) b))
+
+#ifndef true
+#define true 1
+#endif
+
+#ifndef false
+#define false 0
+#endif
 #endif /* _TYPES_H_ */
diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h
new file mode 100644
index ..5c6afdbca642
--- /dev/null
+++ b/arch/powerpc/boot/xz_config.h
@@ -0,0 +1,39 @@
+#ifndef __XZ_CONFIG_H__
+#define __XZ_CONFIG_H__
+
+/*
+ * most of this is copied from lib/xz/xz_private.h, we can't use their defines
+ * since the boot wrapper is not built in the same environment as the rest of
+ * the kernel.
+ */
+
+#include "types.h"
+#include "swab.h"
+
+static inline uint32_t swab32p(void *p)
+{
+   uint32_t *q = p;
+
+   return swab32(*q);
+}
+
+#ifdef __LITTLE_ENDIAN__
+#define get_le32(p) (*((uint32_t *) (p)))
+#else
+#define get_le32(p) swab32p(p)
+#endif
+
+#define memeq(a, b, size) (memcmp(a, b, size) == 0)
+#define memzero(buf, size) memset(buf, 0, size)
+
+/* prevent the inclusion of the xz-preboot MM headers */
+#define DECOMPR_MM_H
+#define memmove memmove
+#define XZ_EXTERN static
+
+/* xz.h needs to be included directly since we need enum xz_mode */
+#include "../../../include/linux/xz.h"
+
+#undef XZ_EXTERN
+
+#endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index f32edec13fd1..d5da55b01027 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -2,6 +2,7 @@ config PPC64
bool "64-bit kernel"
default n
select ZLIB_DEFLATE
+   select HAVE_KERNEL_XZ
help
  This option selects whether a 32-bit or a 64-bit kernel
  will be built.
-- 
2.5.5



XZ compressed zImage support

2016-08-30 Thread Oliver O'Halloran
This series adds support for using XZ compression in addition to gzip in the
kernel boot wrapper. Currently this is only enabled for 64bit Book3S processors
since it seems that some embedded platforms rely on uBoot (or similar) to
decompress the image rather than having the kernel decompress itself. Enabling
it for other platforms should be fairly straight forward though.

Supporting other compression algorithms (like ARM and x86 do) is possible, but
painful. Each algorithm includes some kernel headers even when the #defines
that are supposed to make them usable in a pre-boot environment are set.
Including kernel headers is an issue because on powerpc  the boot wrapper is
compiled with a different toolchain and possibly for a different target for
backwards compatibility reasons*. This makes it difficult to include kernel
headers since the include paths, etc are not setup for BOOTCC.

This can be worked around by rewriting parts of the each decompressor with sed
scripts, but the rewriting required is specific to each decompressor.

-oliver

*powermacs have 32bit firmware that cannot directly load a 64bit kernel. A 64
bit big endian kernel has a 32bit wrapper to work around this. On 64bit little
endian we don't have this legacy problem so the wrapper is also 64bit little
endian, but the toolchain issues are still there.



[PATCH 4/6] powerpc/boot: remove legacy gzip wrapper

2016-08-30 Thread Oliver O'Halloran
This code is no longer used and can be removed.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/gunzip_util.c | 204 
 arch/powerpc/boot/gunzip_util.h |  45 -
 2 files changed, 249 deletions(-)
 delete mode 100644 arch/powerpc/boot/gunzip_util.c
 delete mode 100644 arch/powerpc/boot/gunzip_util.h

diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c
deleted file mode 100644
index 9dc52501de83..
--- a/arch/powerpc/boot/gunzip_util.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright 2007 David Gibson, IBM Corporation.
- * Based on earlier work, Copyright (C) Paul Mackerras 1997.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include 
-#include "string.h"
-#include "stdio.h"
-#include "ops.h"
-#include "gunzip_util.h"
-
-#define HEAD_CRC   2
-#define EXTRA_FIELD4
-#define ORIG_NAME  8
-#define COMMENT0x10
-#define RESERVED   0xe0
-
-/**
- * gunzip_start - prepare to decompress gzip data
- * @state: decompressor state structure to be initialized
- * @src:   buffer containing gzip compressed or uncompressed data
- * @srclen:size in bytes of the buffer at src
- *
- * If the buffer at @src contains a gzip header, this function
- * initializes zlib to decompress the data, storing the decompression
- * state in @state.  The other functions in this file can then be used
- * to decompress data from the gzipped stream.
- *
- * If the buffer at @src does not contain a gzip header, it is assumed
- * to contain uncompressed data.  The buffer information is recorded
- * in @state and the other functions in this file will simply copy
- * data from the uncompressed data stream at @src.
- *
- * Any errors, such as bad compressed data, cause an error to be
- * printed an the platform's exit() function to be called.
- */
-void gunzip_start(struct gunzip_state *state, void *src, int srclen)
-{
-   char *hdr = src;
-   int hdrlen = 0;
-
-   memset(state, 0, sizeof(*state));
-
-   /* Check for gzip magic number */
-   if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) {
-   /* gzip data, initialize zlib parameters */
-   int r, flags;
-
-   state->s.workspace = state->scratch;
-   if (zlib_inflate_workspacesize() > sizeof(state->scratch))
-   fatal("insufficient scratch space for gunzip\n\r");
-
-   /* skip header */
-   hdrlen = 10;
-   flags = hdr[3];
-   if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0)
-   fatal("bad gzipped data\n\r");
-   if ((flags & EXTRA_FIELD) != 0)
-   hdrlen = 12 + hdr[10] + (hdr[11] << 8);
-   if ((flags & ORIG_NAME) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & COMMENT) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & HEAD_CRC) != 0)
-   hdrlen += 2;
-   if (hdrlen >= srclen)
-   fatal("gunzip_start: ran out of data in header\n\r");
-
-   r = zlib_inflateInit2(>s, -MAX_WBITS);
-   if (r != Z_OK)
-   fatal("inflateInit2 returned %d\n\r", r);
-   }
-
-   state->s.total_in = hdrlen;
-   state->s.next_in = src + hdrlen;
-   state->s.avail_in = srclen - hdrlen;
-}
-
-/**
- * gunzip_partial - extract bytes from a gzip data stream
- * @state: gzip state structure previously initialized by gunzip_start()
- * @dst:   buffer to store extracted data
- * @dstlen:maximum number of bytes to extract
- *
- * This function extracts at most @dstlen bytes from the data stream
- * previously associated with @state by gunzip_start(), decompressing
- * if necessary.  Exactly @dstlen bytes are extracted unless the data
- * stream doesn't contain enough bytes, in which case the entire
- * remainder of the stream is decompressed.
- *
- * Returns the actual number of bytes extracted.  If any errors occur,
- * such as a corrupted compressed stream, an error is printed an the
- * platform's exit() function is called.
- */
-int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen)
-{
-   int len;
-
-   if (state->s.workspace) {
-   /* gunzipping */
-   int r;
-
-   state->s.next_out = dst;
-   state->s.avail_out = dstlen;
-   r = zlib_inflate(>s, Z_FULL_FLUSH);
-   

Re: Commit 1b7898ee276b "powerpc/boot: Use the pre-boot decompression API" breaks boot

2016-10-06 Thread Oliver O'Halloran
Hi, Heiner

Could you send me a copy of the kernel .config (or which defconfig)
that you're using, the name of the HW platform that you're using and
if possible the kernel image itself?

Thanks,
Oliver


[PATCH v2 3/6] powerpc/boot: use the preboot decompression API

2016-09-19 Thread Oliver O'Halloran
Currently the powerpc boot wrapper has its own wrapper around zlib to
handle decompressing gzipped kernels. The kernel decompressor library
functions now provide a generic interface that can be used in the pre-boot
environment. This allows boot wrappers to easily support different
compression algorithms. This patch converts the wrapper to use this new
API, but does not add support for using new algorithms.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  34 +++---
 arch/powerpc/boot/decompress.c | 142 +
 arch/powerpc/boot/main.c   |  35 +-
 arch/powerpc/boot/ops.h|   3 +
 4 files changed, 189 insertions(+), 25 deletions(-)
 create mode 100644 arch/powerpc/boot/decompress.c

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index bede555d78cf..861348c72519 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -63,13 +63,28 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
-# the kernel's version of zlib pulls in a lot of other kernel headers
-# which we don't provide inside the wrapper.
+# The pre-boot decompressors pull in a lot of kernel headers and other source
+# files. This creates a bit of a dependency headache since we need to copy
+# these files into the build dir, fix up any includes and ensure that dependent
+# files are copied in the right order.
+
+# these need to be separate variables because they are copied out of different
# directories in the kernel tree. Sure you could merge them, but it's a
+# cure-is-worse-than-disease situation.
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c
 zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
 zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
 zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+$(addprefix $(obj)/, decompress.o): \
+   $(addprefix $(obj)/,$(zlib-decomp-y))
+
+$(addprefix $(obj)/, $(zlib-decomp-y)): \
+   $(addprefix $(obj)/,$(zliblinuxheader-y)) \
+   $(addprefix $(obj)/,$(zlibheader-y)) \
+   $(addprefix $(obj)/,$(zlib-y))
+
+$(addprefix $(obj)/,$(zlib-y)): \
$(addprefix $(obj)/,$(zliblinuxheader-y)) \
$(addprefix $(obj)/,$(zlibheader-y))
 
@@ -79,10 +94,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h
 $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
$(addprefix $(obj)/,$(libfdtheader))
 
-src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
+src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
+   elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -143,6 +158,9 @@ $(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: 
$(srctree)/lib/zlib_inflate/%
 $(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
+$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/%
+   $(call cmd,copy_kern_src)
+
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
 
@@ -160,7 +178,7 @@ $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: 
$(srctree)/$(src)/%.S
$(Q)cp $< $@
 
 clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
-   $(libfdt) $(libfdtheader) \
+   $(zlib-decomp-) $(libfdt) $(libfdtheader) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
 quiet_cmd_bootcc = BOOTCC  $@
@@ -410,8 +428,8 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* 
treeImage.* \
zImage.maple simpleImage.* otheros.bld *.dtb
 
 # clean up files cached by wrapper
-clean-kernel := vmlinux.strip vmlinux.bin
-clean-kernel += $(addsuffix .gz,$(clean-kernel))
+clean-kernel-base := vmlinux.strip vmlinux.bin
+clean-kernel := $(addsuffix .gz,$(clean-kernel-base))
 # If not absolute clean-files are relative to $(obj).
 clean-files += $(addprefix $(objtree)/, $(clean-kernel))
 
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
new file mode 100644
index ..60fc6fb26867
--- /dev/null
+++ b/arch/powerpc/boot/decompress.c
@@ -0,0 +1,142 @@
+/*
+ * Wrapper around the kernel's pre-boot decompression library.
+ *
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Softwar

[PATCH v2 1/6] powerpc/boot: add sed script

2016-09-19 Thread Oliver O'Halloran
The powerpc boot wrapper is compiled with a separate "bootcc" toolchain
rather than the toolchain used for the rest of the kernel. The main
problem with this is that the wrapper does not have access to the kernel
headers (without a lot of gross hacks). To get around this the required
headers are copied into the build directory via several sed scripts
which rewrite problematic includes. This patch moves these fixups out of
the makefile into a separate .sed script file to clean up makefile
slightly.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile  | 16 +---
 arch/powerpc/boot/fixup-headers.sed | 12 
 2 files changed, 17 insertions(+), 11 deletions(-)
 create mode 100644 arch/powerpc/boot/fixup-headers.sed

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index df0fd406aed1..7d6768253caa 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -125,23 +125,17 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix 
$(obj)/, $(src-wlib
 obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat
 obj-plat: $(libfdt)
 
-quiet_cmd_copy_zlib = COPY$@
-  cmd_copy_zlib = sed "s@__used@@;s@]*\).*@\"\1\"@" $< > $@
-
-quiet_cmd_copy_zlibheader = COPY$@
-  cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@
-# stddef.h for NULL
-quiet_cmd_copy_zliblinuxheader = COPY$@
-  cmd_copy_zliblinuxheader = sed 
"s@@\"string.h\"@;s@@@;s@]*\).*@\"\1\"@"
 $< > $@
+quiet_cmd_copy_kern_src = COPY$@
+  cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
 $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlib)
+   $(call cmd,copy_kern_src)
 
 $(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlibheader)
+   $(call cmd,copy_kern_src)
 
 $(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
-   $(call cmd,copy_zliblinuxheader)
+   $(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
diff --git a/arch/powerpc/boot/fixup-headers.sed 
b/arch/powerpc/boot/fixup-headers.sed
new file mode 100644
index ..96362428eb37
--- /dev/null
+++ b/arch/powerpc/boot/fixup-headers.sed
@@ -0,0 +1,12 @@
+# Copyright 2016 IBM Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 or later as
+# published by the Free Software Foundation.
+
+s@#include @@;
+s@\"zlib_inflate/\([^\"]*\).*@"\1"@;
+s@@@;
+
+s@__used@@;
+s@]*\).*@"\1"@;
-- 
2.5.5



[v2] XZ compressed zImage support

2016-09-19 Thread Oliver O'Halloran
This series adds support for using XZ compression in addition to gzip in the
kernel boot wrapper. Currently this is only enabled for 64bit Book3S processors
since it seems that some embedded platforms rely on uBoot (or similar) to
decompress the image rather than having the kernel decompress itself. Enabling
it for other platforms should be fairly straight forward though.

Supporting other compression algorithms (like ARM and x86 do) is possible, but
painful. Each algorithm includes some kernel headers even when the #defines
that are supposed to make them usable in a pre-boot environment are set.
Including kernel headers is an issue because on powerpc  the boot wrapper is
compiled with a different toolchain and possibly for a different target for
backwards compatibility reasons*. This makes it difficult to include kernel
headers since the include paths, etc are not setup for BOOTCC.

This can be worked around by rewriting parts of the each decompressor with sed
scripts, but the rewriting required is specific to each decompressor.

-oliver

*powermacs have 32bit firmware that cannot directly load a 64bit kernel. A 64
bit big endian kernel has a 32bit wrapper to work around this. On 64bit little
endian we don't have this legacy problem so the wrapper is also 64bit little
endian, but the toolchain issues are still there.

---
Changes from v1:
fixed some missing dependencies in the Makefile that were causing random
build breaks.

Fixed "make clean" so that it would remove the files copied into
arch/powerpc/boot/ when the wrapper was built.

previously this series renamed "zlibheader" to "zlibheaders". There were
consequences.
---


[PATCH v2 6/6] powerpc/boot: Add support for XZ compression

2016-09-19 Thread Oliver O'Halloran
This patch adds an option to use XZ compression for the kernel image.
Currently this is only enabled for PPC64 targets since the bulk of the
32bit platforms produce uboot images which do not use the wrapper.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  3 +++
 arch/powerpc/boot/decompress.c |  5 +
 arch/powerpc/boot/types.h  | 10 +
 arch/powerpc/boot/xz_config.h  | 39 ++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 5 files changed, 58 insertions(+)
 create mode 100644 arch/powerpc/boot/xz_config.h

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9fb451d0586e..eae2dc8bc218 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,6 +20,7 @@
 all: $(obj)/zImage
 
 compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+compress-$(CONFIG_KERNEL_XZ)   := CONFIG_KERNEL_XZ
 
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
@@ -226,6 +227,7 @@ endif
 endif
 
 compressor-$(CONFIG_KERNEL_GZIP) := gz
+compressor-$(CONFIG_KERNEL_XZ)   := xz
 
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
@@ -433,6 +435,7 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* 
treeImage.* \
 # clean up files cached by wrapper
 clean-kernel-base := vmlinux.strip vmlinux.bin
 clean-kernel := $(addsuffix .gz,$(clean-kernel-base))
+clean-kernel += $(addsuffix .xz,$(clean-kernel-base))
 # If not absolute clean-files are relative to $(obj).
 clean-files += $(addprefix $(objtree)/, $(clean-kernel))
 
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
index 60fc6fb26867..8f32ea4289af 100644
--- a/arch/powerpc/boot/decompress.c
+++ b/arch/powerpc/boot/decompress.c
@@ -37,6 +37,11 @@
 #  include "decompress_inflate.c"
 #endif
 
+#ifdef CONFIG_KERNEL_XZ
+#  include "xz_config.h"
+#  include "../../../lib/decompress_unxz.c"
+#endif
+
 /* globals for tracking the state of the decompression */
 static unsigned long decompressed_bytes;
 static unsigned long limit;
diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h
index 85565a89bcc2..0362a262a299 100644
--- a/arch/powerpc/boot/types.h
+++ b/arch/powerpc/boot/types.h
@@ -34,4 +34,14 @@ typedef s64 int64_t;
(void) (&_x == &_y);\
_x > _y ? _x : _y; })
 
+#define min_t(type, a, b) min(((type) a), ((type) b))
+#define max_t(type, a, b) max(((type) a), ((type) b))
+
+#ifndef true
+#define true 1
+#endif
+
+#ifndef false
+#define false 0
+#endif
 #endif /* _TYPES_H_ */
diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h
new file mode 100644
index ..5c6afdbca642
--- /dev/null
+++ b/arch/powerpc/boot/xz_config.h
@@ -0,0 +1,39 @@
+#ifndef __XZ_CONFIG_H__
+#define __XZ_CONFIG_H__
+
+/*
+ * most of this is copied from lib/xz/xz_private.h, we can't use their defines
+ * since the boot wrapper is not built in the same environment as the rest of
+ * the kernel.
+ */
+
+#include "types.h"
+#include "swab.h"
+
+static inline uint32_t swab32p(void *p)
+{
+   uint32_t *q = p;
+
+   return swab32(*q);
+}
+
+#ifdef __LITTLE_ENDIAN__
+#define get_le32(p) (*((uint32_t *) (p)))
+#else
+#define get_le32(p) swab32p(p)
+#endif
+
+#define memeq(a, b, size) (memcmp(a, b, size) == 0)
+#define memzero(buf, size) memset(buf, 0, size)
+
+/* prevent the inclusion of the xz-preboot MM headers */
+#define DECOMPR_MM_H
+#define memmove memmove
+#define XZ_EXTERN static
+
+/* xz.h needs to be included directly since we need enum xz_mode */
+#include "../../../include/linux/xz.h"
+
+#undef XZ_EXTERN
+
+#endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index f32edec13fd1..d5da55b01027 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -2,6 +2,7 @@ config PPC64
bool "64-bit kernel"
default n
select ZLIB_DEFLATE
+   select HAVE_KERNEL_XZ
help
  This option selects whether a 32-bit or a 64-bit kernel
  will be built.
-- 
2.5.5



[PATCH v2 5/6] powerpc/boot: add xz support to the wrapper script

2016-09-19 Thread Oliver O'Halloran
This modifies the script so that the -Z option takes an argument to
specify the compression type. It can either be 'gz', 'xz' or 'none'.
The legacy --no-gzip and -z options are still supported and will set
the compression to none and gzip respectively, but they are not
documented.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  7 --
 arch/powerpc/boot/wrapper  | 61 ++
 2 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 861348c72519..9fb451d0586e 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -225,10 +225,13 @@ CROSSWRAP := -C "$(CROSS_COMPILE)"
 endif
 endif
 
+compressor-$(CONFIG_KERNEL_GZIP) := gz
+
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
-  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \
-   $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux
+  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \
+   $(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \
+   vmlinux
 
 image-$(CONFIG_PPC_PSERIES)+= zImage.pseries
 image-$(CONFIG_PPC_POWERNV)+= zImage.pseries
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 6681ec3625c9..cf7631be5007 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -20,6 +20,8 @@
 # -D dir   specify directory containing data files used by script
 #  (default ./arch/powerpc/boot)
 # -W dir   specify working directory for temporary files (default .)
+# -z   use gzip (legacy)
+# -Z zsuffixcompression to use (gz, xz or none)
 
 # Stop execution if any command fails
 set -e
@@ -38,7 +40,7 @@ dtb=
 dts=
 cacheit=
 binary=
-gzip=.gz
+compression=.gz
 pie=
 format=
 
@@ -59,7 +61,8 @@ tmpdir=.
 usage() {
 echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
 echo '   [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-echo '   [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2
+echo '   [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
+echo '   [--no-compression] [vmlinux]' >&2
 exit 1
 }
 
@@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do
[ "$#" -gt 0 ] || usage
tmpdir="$1"
;;
+-z)
+   compression=.gz
+   ;;
+-Z)
+   shift
+   [ "$#" -gt 0 ] || usage
+[ "$1" != "gz" -o "$1" != "xz" -o "$1" != "none" ] || usage
+
+   compression=".$1"
+
+if [ $compression = ".none" ]; then
+compression=
+fi
+   ;;
 --no-gzip)
-gzip=
+# a "feature" of the the wrapper script is that it can be used outside
+# the kernel tree. So keeping this around for backwards compatibility.
+compression=
 ;;
 -?)
usage
@@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do
 shift
 done
 
+
 if [ -n "$dts" ]; then
 if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then
dts="$object/dts/$dts"
@@ -212,7 +232,7 @@ miboot|uboot*)
 ;;
 cuboot*)
 binary=y
-gzip=
+compression=
 case "$platform" in
 *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc)
 platformo=$object/cuboot-8xx.o
@@ -243,7 +263,7 @@ cuboot*)
 ps3)
 platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o"
 lds=$object/zImage.ps3.lds
-gzip=
+compression=
 ext=bin
 objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data"
 ksection=.kernel:vmlinux.bin
@@ -310,27 +330,37 @@ mvme7100)
 esac
 
 vmz="$tmpdir/`basename \"$kernel\"`.$ext"
-if [ -z "$cacheit" -o ! -f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then
-${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
 
-strip_size=$(stat -c %s $vmz.$$)
+# Calculate the vmlinux.strip size
+${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
+strip_size=$(stat -c %s $vmz.$$)
 
-if [ -n "$gzip" ]; then
+if [ -z "$cacheit" -o ! -f "$vmz$compression" -o "$vmz$compression" -ot 
"$kernel" ]; then
+# recompress the image if we need to
+case $compression in
+.xz)
+xz --check=crc32 -f -9 "$vmz.$$"
+;;
+.gz)
 gzip -n -f -9 "$vmz.$$"
-fi
+;;
+*)
+# drop the compression suffix so the stripped vmlinux is used
+compression=
+   ;;
+esac
 
 if [ -n "$cacheit" ]; then
-   mv -f "$

[PATCH v2 2/6] powerpc/boot: Use CONFIG_KERNEL_GZIP

2016-09-19 Thread Oliver O'Halloran
Most architectures allow the compression algorithm used to produce the
vmlinuz image to be selected as a kernel config option. In preparation
for supporting algorithms other than gzip in the powerpc boot wrapper
the makefile needs to be modified to use these config options.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/boot/Makefile | 30 ++
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 914983a29156..aa96bda118aa 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -161,6 +161,7 @@ config PPC
select GENERIC_CPU_AUTOPROBE
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_ARCH_HARDENED_USERCOPY
+   select HAVE_KERNEL_GZIP
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 7d6768253caa..bede555d78cf 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -19,10 +19,14 @@
 
 all: $(obj)/zImage
 
+compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
--isystem $(shell $(CROSS32CC) -print-file-name=include)
+-isystem $(shell $(CROSS32CC) -print-file-name=include) \
+-D$(compress-y)
+
 ifdef CONFIG_PPC64_BOOT_WRAPPER
 BOOTCFLAGS += -m64
 endif
@@ -59,13 +63,15 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
+# the kernel's version of zlib pulls in a lot of other kernel headers
+# which we don't provide inside the wrapper.
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
+zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
+zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-zlib   := inffast.c inflate.c inftrees.c
-zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h
-zliblinuxheader := zlib.h zconf.h zutil.h
-
-$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \
-   $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix 
$(obj)/,$(zlibheader))
+$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+   $(addprefix $(obj)/,$(zliblinuxheader-y)) \
+   $(addprefix $(obj)/,$(zlibheader-y))
 
 libfdt   := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
@@ -76,7 +82,7 @@ $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o 
epapr.o opal.o): \
 src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
+   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -128,13 +134,13 @@ obj-plat: $(libfdt)
 quiet_cmd_copy_kern_src = COPY$@
   cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
-$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
+$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
@@ -153,7 +159,7 @@ $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S
 $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S
$(Q)cp $< $@
 
-clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \
+clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
$(libfdt) $(libfdtheader) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
-- 
2.5.5



[PATCH v2 4/6] powerpc/boot: remove legacy gzip wrapper

2016-09-19 Thread Oliver O'Halloran
This code is no longer used and can be removed.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/cuboot-c2k.c  |   1 -
 arch/powerpc/boot/gunzip_util.c | 204 
 arch/powerpc/boot/gunzip_util.h |  45 -
 3 files changed, 250 deletions(-)
 delete mode 100644 arch/powerpc/boot/gunzip_util.c
 delete mode 100644 arch/powerpc/boot/gunzip_util.h

diff --git a/arch/powerpc/boot/cuboot-c2k.c b/arch/powerpc/boot/cuboot-c2k.c
index e43594950ba3..9309c51f1d65 100644
--- a/arch/powerpc/boot/cuboot-c2k.c
+++ b/arch/powerpc/boot/cuboot-c2k.c
@@ -18,7 +18,6 @@
 #include "io.h"
 #include "ops.h"
 #include "elf.h"
-#include "gunzip_util.h"
 #include "mv64x60.h"
 #include "cuboot.h"
 #include "ppcboot.h"
diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c
deleted file mode 100644
index 9dc52501de83..
--- a/arch/powerpc/boot/gunzip_util.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright 2007 David Gibson, IBM Corporation.
- * Based on earlier work, Copyright (C) Paul Mackerras 1997.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include 
-#include "string.h"
-#include "stdio.h"
-#include "ops.h"
-#include "gunzip_util.h"
-
-#define HEAD_CRC   2
-#define EXTRA_FIELD4
-#define ORIG_NAME  8
-#define COMMENT0x10
-#define RESERVED   0xe0
-
-/**
- * gunzip_start - prepare to decompress gzip data
- * @state: decompressor state structure to be initialized
- * @src:   buffer containing gzip compressed or uncompressed data
- * @srclen:size in bytes of the buffer at src
- *
- * If the buffer at @src contains a gzip header, this function
- * initializes zlib to decompress the data, storing the decompression
- * state in @state.  The other functions in this file can then be used
- * to decompress data from the gzipped stream.
- *
- * If the buffer at @src does not contain a gzip header, it is assumed
- * to contain uncompressed data.  The buffer information is recorded
- * in @state and the other functions in this file will simply copy
- * data from the uncompressed data stream at @src.
- *
- * Any errors, such as bad compressed data, cause an error to be
- * printed an the platform's exit() function to be called.
- */
-void gunzip_start(struct gunzip_state *state, void *src, int srclen)
-{
-   char *hdr = src;
-   int hdrlen = 0;
-
-   memset(state, 0, sizeof(*state));
-
-   /* Check for gzip magic number */
-   if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) {
-   /* gzip data, initialize zlib parameters */
-   int r, flags;
-
-   state->s.workspace = state->scratch;
-   if (zlib_inflate_workspacesize() > sizeof(state->scratch))
-   fatal("insufficient scratch space for gunzip\n\r");
-
-   /* skip header */
-   hdrlen = 10;
-   flags = hdr[3];
-   if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0)
-   fatal("bad gzipped data\n\r");
-   if ((flags & EXTRA_FIELD) != 0)
-   hdrlen = 12 + hdr[10] + (hdr[11] << 8);
-   if ((flags & ORIG_NAME) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & COMMENT) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & HEAD_CRC) != 0)
-   hdrlen += 2;
-   if (hdrlen >= srclen)
-   fatal("gunzip_start: ran out of data in header\n\r");
-
-   r = zlib_inflateInit2(>s, -MAX_WBITS);
-   if (r != Z_OK)
-   fatal("inflateInit2 returned %d\n\r", r);
-   }
-
-   state->s.total_in = hdrlen;
-   state->s.next_in = src + hdrlen;
-   state->s.avail_in = srclen - hdrlen;
-}
-
-/**
- * gunzip_partial - extract bytes from a gzip data stream
- * @state: gzip state structure previously initialized by gunzip_start()
- * @dst:   buffer to store extracted data
- * @dstlen:maximum number of bytes to extract
- *
- * This function extracts at most @dstlen bytes from the data stream
- * previously associated with @state by gunzip_start(), decompressing
- * if necessary.  Exactly @dstlen bytes are extracted unless the data
- * stream doesn't contain enough bytes, in which case the entire
- * remainder of the stream is decompressed.
- *
- * Returns the actual number of bytes extracted.  If any err

[PATCH 1/6] powerpc/boot: add sed script

2016-09-22 Thread Oliver O'Halloran
The powerpc boot wrapper is compiled with a separate "bootcc" toolchain
rather than the toolchain used for the rest of the kernel. The main
problem with this is that the wrapper does not have access to the kernel
headers (without a lot of gross hacks). To get around this the required
headers are copied into the build directory via several sed scripts
which rewrite problematic includes. This patch moves these fixups out of
the makefile into a separate .sed script file to clean up the makefile
slightly.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile  | 16 +---
 arch/powerpc/boot/fixup-headers.sed | 12 
 2 files changed, 17 insertions(+), 11 deletions(-)
 create mode 100644 arch/powerpc/boot/fixup-headers.sed

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index df0fd406aed1..7d6768253caa 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -125,23 +125,17 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix 
$(obj)/, $(src-wlib
 obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat
 obj-plat: $(libfdt)
 
-quiet_cmd_copy_zlib = COPY$@
-  cmd_copy_zlib = sed "s@__used@@;s@]*\).*@\"\1\"@" $< > $@
-
-quiet_cmd_copy_zlibheader = COPY$@
-  cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@
-# stddef.h for NULL
-quiet_cmd_copy_zliblinuxheader = COPY$@
-  cmd_copy_zliblinuxheader = sed 
"s@@\"string.h\"@;s@@@;s@]*\).*@\"\1\"@"
 $< > $@
+quiet_cmd_copy_kern_src = COPY$@
+  cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
 $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlib)
+   $(call cmd,copy_kern_src)
 
 $(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-   $(call cmd,copy_zlibheader)
+   $(call cmd,copy_kern_src)
 
 $(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
-   $(call cmd,copy_zliblinuxheader)
+   $(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
diff --git a/arch/powerpc/boot/fixup-headers.sed 
b/arch/powerpc/boot/fixup-headers.sed
new file mode 100644
index ..96362428eb37
--- /dev/null
+++ b/arch/powerpc/boot/fixup-headers.sed
@@ -0,0 +1,12 @@
+# Copyright 2016 IBM Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 or later as
+# published by the Free Software Foundation.
+
+s@#include @@;
+s@\"zlib_inflate/\([^\"]*\).*@"\1"@;
+s@@@;
+
+s@__used@@;
+s@]*\).*@"\1"@;
-- 
2.5.5



[PATCH 4/6] powerpc/boot: remove legacy gzip wrapper

2016-09-22 Thread Oliver O'Halloran
This code is no longer used and can be removed.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/cuboot-c2k.c  |   1 -
 arch/powerpc/boot/gunzip_util.c | 204 
 arch/powerpc/boot/gunzip_util.h |  45 -
 3 files changed, 250 deletions(-)
 delete mode 100644 arch/powerpc/boot/gunzip_util.c
 delete mode 100644 arch/powerpc/boot/gunzip_util.h

diff --git a/arch/powerpc/boot/cuboot-c2k.c b/arch/powerpc/boot/cuboot-c2k.c
index e43594950ba3..9309c51f1d65 100644
--- a/arch/powerpc/boot/cuboot-c2k.c
+++ b/arch/powerpc/boot/cuboot-c2k.c
@@ -18,7 +18,6 @@
 #include "io.h"
 #include "ops.h"
 #include "elf.h"
-#include "gunzip_util.h"
 #include "mv64x60.h"
 #include "cuboot.h"
 #include "ppcboot.h"
diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c
deleted file mode 100644
index 9dc52501de83..
--- a/arch/powerpc/boot/gunzip_util.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright 2007 David Gibson, IBM Corporation.
- * Based on earlier work, Copyright (C) Paul Mackerras 1997.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include 
-#include "string.h"
-#include "stdio.h"
-#include "ops.h"
-#include "gunzip_util.h"
-
-#define HEAD_CRC   2
-#define EXTRA_FIELD4
-#define ORIG_NAME  8
-#define COMMENT0x10
-#define RESERVED   0xe0
-
-/**
- * gunzip_start - prepare to decompress gzip data
- * @state: decompressor state structure to be initialized
- * @src:   buffer containing gzip compressed or uncompressed data
- * @srclen:size in bytes of the buffer at src
- *
- * If the buffer at @src contains a gzip header, this function
- * initializes zlib to decompress the data, storing the decompression
- * state in @state.  The other functions in this file can then be used
- * to decompress data from the gzipped stream.
- *
- * If the buffer at @src does not contain a gzip header, it is assumed
- * to contain uncompressed data.  The buffer information is recorded
- * in @state and the other functions in this file will simply copy
- * data from the uncompressed data stream at @src.
- *
- * Any errors, such as bad compressed data, cause an error to be
- * printed an the platform's exit() function to be called.
- */
-void gunzip_start(struct gunzip_state *state, void *src, int srclen)
-{
-   char *hdr = src;
-   int hdrlen = 0;
-
-   memset(state, 0, sizeof(*state));
-
-   /* Check for gzip magic number */
-   if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) {
-   /* gzip data, initialize zlib parameters */
-   int r, flags;
-
-   state->s.workspace = state->scratch;
-   if (zlib_inflate_workspacesize() > sizeof(state->scratch))
-   fatal("insufficient scratch space for gunzip\n\r");
-
-   /* skip header */
-   hdrlen = 10;
-   flags = hdr[3];
-   if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0)
-   fatal("bad gzipped data\n\r");
-   if ((flags & EXTRA_FIELD) != 0)
-   hdrlen = 12 + hdr[10] + (hdr[11] << 8);
-   if ((flags & ORIG_NAME) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & COMMENT) != 0)
-   while (hdr[hdrlen++] != 0)
-   ;
-   if ((flags & HEAD_CRC) != 0)
-   hdrlen += 2;
-   if (hdrlen >= srclen)
-   fatal("gunzip_start: ran out of data in header\n\r");
-
-   r = zlib_inflateInit2(>s, -MAX_WBITS);
-   if (r != Z_OK)
-   fatal("inflateInit2 returned %d\n\r", r);
-   }
-
-   state->s.total_in = hdrlen;
-   state->s.next_in = src + hdrlen;
-   state->s.avail_in = srclen - hdrlen;
-}
-
-/**
- * gunzip_partial - extract bytes from a gzip data stream
- * @state: gzip state structure previously initialized by gunzip_start()
- * @dst:   buffer to store extracted data
- * @dstlen:maximum number of bytes to extract
- *
- * This function extracts at most @dstlen bytes from the data stream
- * previously associated with @state by gunzip_start(), decompressing
- * if necessary.  Exactly @dstlen bytes are extracted unless the data
- * stream doesn't contain enough bytes, in which case the entire
- * remainder of the stream is decompressed.
- *
- * Returns the actual number of bytes extracted.  If any err

[PATCH 5/6] powerpc/boot: add xz support to the wrapper script

2016-09-22 Thread Oliver O'Halloran
This modifies the script so that the -Z option takes an argument to
specify the compression type. It can either be 'gz', 'xz' or 'none'.
The legacy --no-gzip and -z options are still supported and will set
the compression to none and gzip respectively, but they are not
documented.

Only xz -6 is used for compression rather than xz -9. Using compression
levels higher than 6 requires the decompressor to build a large (64MB)
dictionary when decompressing and some environments cannot satisfy large
allocations (e.g. POWER 6 LPAR partition firmware).

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  7 --
 arch/powerpc/boot/wrapper  | 61 ++
 2 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 861348c72519..9fb451d0586e 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -225,10 +225,13 @@ CROSSWRAP := -C "$(CROSS_COMPILE)"
 endif
 endif
 
+compressor-$(CONFIG_KERNEL_GZIP) := gz
+
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
-  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \
-   $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux
+  cmd_wrap =$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \
+   $(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \
+   vmlinux
 
 image-$(CONFIG_PPC_PSERIES)+= zImage.pseries
 image-$(CONFIG_PPC_POWERNV)+= zImage.pseries
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 6681ec3625c9..6feacfd87588 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -20,6 +20,8 @@
 # -D dir   specify directory containing data files used by script
 #  (default ./arch/powerpc/boot)
 # -W dir   specify working directory for temporary files (default .)
+# -z   use gzip (legacy)
+# -Z zsuffixcompression to use (gz, xz or none)
 
 # Stop execution if any command fails
 set -e
@@ -38,7 +40,7 @@ dtb=
 dts=
 cacheit=
 binary=
-gzip=.gz
+compression=.gz
 pie=
 format=
 
@@ -59,7 +61,8 @@ tmpdir=.
 usage() {
 echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
 echo '   [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-echo '   [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2
+echo '   [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
+echo '   [--no-compression] [vmlinux]' >&2
 exit 1
 }
 
@@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do
[ "$#" -gt 0 ] || usage
tmpdir="$1"
;;
+-z)
+   compression=.gz
+   ;;
+-Z)
+   shift
+   [ "$#" -gt 0 ] || usage
+[ "$1" != "gz" -o "$1" != "xz" -o "$1" != "none" ] || usage
+
+   compression=".$1"
+
+if [ $compression = ".none" ]; then
+compression=
+fi
+   ;;
 --no-gzip)
-gzip=
+# a "feature" of the the wrapper script is that it can be used outside
+# the kernel tree. So keeping this around for backwards compatibility.
+compression=
 ;;
 -?)
usage
@@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do
 shift
 done
 
+
 if [ -n "$dts" ]; then
 if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then
dts="$object/dts/$dts"
@@ -212,7 +232,7 @@ miboot|uboot*)
 ;;
 cuboot*)
 binary=y
-gzip=
+compression=
 case "$platform" in
 *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc)
 platformo=$object/cuboot-8xx.o
@@ -243,7 +263,7 @@ cuboot*)
 ps3)
 platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o"
 lds=$object/zImage.ps3.lds
-gzip=
+compression=
 ext=bin
 objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data"
 ksection=.kernel:vmlinux.bin
@@ -310,27 +330,37 @@ mvme7100)
 esac
 
 vmz="$tmpdir/`basename \"$kernel\"`.$ext"
-if [ -z "$cacheit" -o ! -f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then
-${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
 
-strip_size=$(stat -c %s $vmz.$$)
+# Calculate the vmlinux.strip size
+${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
+strip_size=$(stat -c %s $vmz.$$)
 
-if [ -n "$gzip" ]; then
+if [ -z "$cacheit" -o ! -f "$vmz$compression" -o "$vmz$compression" -ot 
"$kernel" ]; then
+# recompress the image if we need to
+case $compression in
+.xz)
+xz --check=crc32 -f -6 "$vmz.$$"
+;;

[v3] XZ compressed zImage support

2016-09-22 Thread Oliver O'Halloran
This series adds support for using XZ compression in addition to gzip in the
kernel boot wrapper. Currently this is only enabled for 64bit Book3S processors
since it seems that some embedded platforms rely on uBoot (or similar) to
decompress the image rather than having the kernel decompress itself. Enabling
it for other platforms should be fairly straight forward though.

Supporting other compression algorithms (like ARM and x86 do) is possible, but
painful. Each algorithm includes some kernel headers even when the #defines
that are supposed to make them usable in a pre-boot environment are set.
Including kernel headers is an issue because on powerpc  the boot wrapper is
compiled with a different toolchain and possibly for a different target for
backwards compatibility reasons*. This makes it difficult to include kernel
headers since the include paths, etc are not setup for BOOTCC.

This can be worked around by rewriting parts of the each decompressor with sed
scripts, but the rewriting required is specific to each decompressor.

-oliver

*powermacs have 32bit firmware that cannot directly load a 64bit kernel. A 64
bit big endian kernel has a 32bit wrapper to work around this. On 64bit little
endian we don't have this legacy problem so the wrapper is also 64bit little
endian, but the toolchain issues are still there.

---
Changes from v1:
fixed some missing dependencies in the Makefile that were causing random
build breaks.

Fixed "make clean" so that it would remove the files copied into
arch/powerpc/boot/ when the wrapper was built.

previously this series renamed "zlibheader" to "zlibheaders". There were
consequences.

Changes from v2:
Adding missing stdint.h and stdbool.h

Reduced XZ compression level from -9 to -6. Using compression levels
above -6 requires the decompressor to construct a 64MB dictionary. The
firmware on some platforms cannot satisfy large allocations (even when
the memory is physically present) causing decompression failures.
Luckily using the lower compression level doesn't have much of a
penalty.
---


[PATCH 6/6] powerpc/boot: Add support for XZ compression

2016-09-22 Thread Oliver O'Halloran
This patch adds an option to use XZ compression for the kernel image.
Currently this is only enabled for PPC64 targets since the bulk of the
32bit platforms produce uboot images which do not use the wrapper.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  3 +++
 arch/powerpc/boot/decompress.c |  5 +
 arch/powerpc/boot/stdbool.h| 15 +
 arch/powerpc/boot/stdint.h | 13 
 arch/powerpc/boot/types.h  | 14 
 arch/powerpc/boot/xz_config.h  | 39 ++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 7 files changed, 90 insertions(+)
 create mode 100644 arch/powerpc/boot/stdbool.h
 create mode 100644 arch/powerpc/boot/stdint.h
 create mode 100644 arch/powerpc/boot/xz_config.h

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9fb451d0586e..eae2dc8bc218 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,6 +20,7 @@
 all: $(obj)/zImage
 
 compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+compress-$(CONFIG_KERNEL_XZ)   := CONFIG_KERNEL_XZ
 
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
@@ -226,6 +227,7 @@ endif
 endif
 
 compressor-$(CONFIG_KERNEL_GZIP) := gz
+compressor-$(CONFIG_KERNEL_XZ)   := xz
 
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap = WRAP$@
@@ -433,6 +435,7 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* 
treeImage.* \
 # clean up files cached by wrapper
 clean-kernel-base := vmlinux.strip vmlinux.bin
 clean-kernel := $(addsuffix .gz,$(clean-kernel-base))
+clean-kernel += $(addsuffix .xz,$(clean-kernel-base))
 # If not absolute clean-files are relative to $(obj).
 clean-files += $(addprefix $(objtree)/, $(clean-kernel))
 
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
index 60fc6fb26867..8f32ea4289af 100644
--- a/arch/powerpc/boot/decompress.c
+++ b/arch/powerpc/boot/decompress.c
@@ -37,6 +37,11 @@
 #  include "decompress_inflate.c"
 #endif
 
+#ifdef CONFIG_KERNEL_XZ
+#  include "xz_config.h"
+#  include "../../../lib/decompress_unxz.c"
+#endif
+
 /* globals for tracking the state of the decompression */
 static unsigned long decompressed_bytes;
 static unsigned long limit;
diff --git a/arch/powerpc/boot/stdbool.h b/arch/powerpc/boot/stdbool.h
new file mode 100644
index ..2ebcfa53b4c7
--- /dev/null
+++ b/arch/powerpc/boot/stdbool.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This file is only necessary because some of the pre-boot decompressors
+ * expect stdbool.h to be available.
+ *
+ */
+
+#include "types.h"
+
diff --git a/arch/powerpc/boot/stdint.h b/arch/powerpc/boot/stdint.h
new file mode 100644
index ..c1c853be7490
--- /dev/null
+++ b/arch/powerpc/boot/stdint.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This file is only necessary because some of the pre-boot decompressors
+ * expect stdint.h to be available.
+ */
+
+#include "types.h"
diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h
index 85565a89bcc2..af6b66b842c4 100644
--- a/arch/powerpc/boot/types.h
+++ b/arch/powerpc/boot/types.h
@@ -1,6 +1,8 @@
 #ifndef _TYPES_H_
 #define _TYPES_H_
 
+#include 
+
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
 typedef unsigned char  u8;
@@ -34,4 +36,16 @@ typedef s64 int64_t;
(void) (&_x == &_y);\
_x > _y ? _x : _y; })
 
+#define min_t(type, a, b) min(((type) a), ((type) b))
+#define max_t(type, a, b) max(((type) a), ((type) b))
+
+typedef int bool;
+
+#ifndef true
+#define true 1
+#endif
+
+#ifndef false
+#define false 0
+#endif
 #endif /* _TYPES_H_ */
diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h
new file mode 100644
index ..5c6afdbca642
--- /dev/null
+++ b/arch/powerpc/boot/xz_config.h
@@ -0,0 +1,39 @@
+#ifndef __XZ_CONFIG_H__
+#define __XZ_CONFIG_H__
+
+/*
+ * most of this is copied from lib/xz/xz_private.h, we can't use their defines
+ * since the boot wrapper is not built in the same environment as the rest of
+ * the kernel.
+ */
+
+#include "types.h"
+#include "swab.h"
+
+static inline uint32_t swab32p(void *p)
+{
+ 

[PATCH 3/6] powerpc/boot: use the preboot decompression API

2016-09-22 Thread Oliver O'Halloran
Currently the powerpc boot wrapper has its own wrapper around zlib to
handle decompressing gzipped kernels. The kernel decompressor library
functions now provide a generic interface that can be used in the pre-boot
environment. This allows boot wrappers to easily support different
compression algorithms. This patch converts the wrapper to use this new
API, but does not add support for using new algorithms.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/Makefile |  34 +++---
 arch/powerpc/boot/decompress.c | 142 +
 arch/powerpc/boot/main.c   |  35 +-
 arch/powerpc/boot/ops.h|   3 +
 4 files changed, 189 insertions(+), 25 deletions(-)
 create mode 100644 arch/powerpc/boot/decompress.c

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index bede555d78cf..861348c72519 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -63,13 +63,28 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
-# the kernel's version of zlib pulls in a lot of other kernel headers
-# which we don't provide inside the wrapper.
+# The pre-boot decompressors pull in a lot of kernel headers and other source
+# files. This creates a bit of a dependency headache since we need to copy
+# these files into the build dir, fix up any includes and ensure that dependent
+# files are copied in the right order.
+
+# these need to be seperate variables because they are copied out of different
+# directories in the kernel tree. Sure you COULd merge them, but it's a
+# cure-is-worse-than-disease situation.
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c
 zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
 zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
 zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+$(addprefix $(obj)/, decompress.o): \
+   $(addprefix $(obj)/,$(zlib-decomp-y))
+
+$(addprefix $(obj)/, $(zlib-decomp-y)): \
+   $(addprefix $(obj)/,$(zliblinuxheader-y)) \
+   $(addprefix $(obj)/,$(zlibheader-y)) \
+   $(addprefix $(obj)/,$(zlib-y))
+
+$(addprefix $(obj)/,$(zlib-y)): \
$(addprefix $(obj)/,$(zliblinuxheader-y)) \
$(addprefix $(obj)/,$(zlibheader-y))
 
@@ -79,10 +94,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h
 $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
$(addprefix $(obj)/,$(libfdtheader))
 
-src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
+src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
+   elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -143,6 +158,9 @@ $(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: 
$(srctree)/lib/zlib_inflate/%
 $(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
+$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/%
+   $(call cmd,copy_kern_src)
+
 quiet_cmd_copy_libfdt = COPY$@
   cmd_copy_libfdt = cp $< $@
 
@@ -160,7 +178,7 @@ $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: 
$(srctree)/$(src)/%.S
$(Q)cp $< $@
 
 clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
-   $(libfdt) $(libfdtheader) \
+   $(zlib-decomp-) $(libfdt) $(libfdtheader) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
 quiet_cmd_bootcc = BOOTCC  $@
@@ -410,8 +428,8 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* 
treeImage.* \
zImage.maple simpleImage.* otheros.bld *.dtb
 
 # clean up files cached by wrapper
-clean-kernel := vmlinux.strip vmlinux.bin
-clean-kernel += $(addsuffix .gz,$(clean-kernel))
+clean-kernel-base := vmlinux.strip vmlinux.bin
+clean-kernel := $(addsuffix .gz,$(clean-kernel-base))
 # If not absolute clean-files are relative to $(obj).
 clean-files += $(addprefix $(objtree)/, $(clean-kernel))
 
diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
new file mode 100644
index ..60fc6fb26867
--- /dev/null
+++ b/arch/powerpc/boot/decompress.c
@@ -0,0 +1,142 @@
+/*
+ * Wrapper around the kernel's pre-boot decompression library.
+ *
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Softwar

[PATCH 2/6] powerpc/boot: Use CONFIG_KERNEL_GZIP

2016-09-22 Thread Oliver O'Halloran
Most architectures allow the compression algorithm used to produce the
vmlinuz image to be selected as a kernel config option. In preparation
for supporting algorithms other than gzip in the powerpc boot wrapper
the makefile needs to be modified to use these config options.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/boot/Makefile | 30 ++
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5c295830e8c7..59e53f4552ae 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -161,6 +161,7 @@ config PPC
select GENERIC_CPU_AUTOPROBE
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_ARCH_HARDENED_USERCOPY
+   select HAVE_KERNEL_GZIP
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 7d6768253caa..bede555d78cf 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -19,10 +19,14 @@
 
 all: $(obj)/zImage
 
+compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+
 BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 -fno-strict-aliasing -Os -msoft-float -pipe \
 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
--isystem $(shell $(CROSS32CC) -print-file-name=include)
+-isystem $(shell $(CROSS32CC) -print-file-name=include) \
+-D$(compress-y)
+
 ifdef CONFIG_PPC64_BOOT_WRAPPER
 BOOTCFLAGS += -m64
 endif
@@ -59,13 +63,15 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
+# the kernel's version of zlib pulls in a lot of other kernel headers
+# which we don't provide inside the wrapper.
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
+zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h 
infutil.h
+zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-zlib   := inffast.c inflate.c inftrees.c
-zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h
-zliblinuxheader := zlib.h zconf.h zutil.h
-
-$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \
-   $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix 
$(obj)/,$(zlibheader))
+$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \
+   $(addprefix $(obj)/,$(zliblinuxheader-y)) \
+   $(addprefix $(obj)/,$(zlibheader-y))
 
 libfdt   := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
@@ -76,7 +82,7 @@ $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o 
epapr.o opal.o): \
 src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
$(libfdt) libfdt-wrapper.c \
ns16550.c serial.c simple_alloc.c div64.S util.S \
-   gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
+   gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \
oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -128,13 +134,13 @@ obj-plat: $(libfdt)
 quiet_cmd_copy_kern_src = COPY$@
   cmd_copy_kern_src = sed -f 
$(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
-$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+$(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
+$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
$(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY$@
@@ -153,7 +159,7 @@ $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S
 $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S
$(Q)cp $< $@
 
-clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \
+clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
$(libfdt) $(libfdtheader) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
-- 
2.5.5



Re: [PATCH v5 04/13] powerpc: Factor out relocation code from module_64.c to elf_util_64.c.

2016-08-23 Thread Oliver O'Halloran
On Tue, Aug 23, 2016 at 1:21 PM, Balbir Singh  wrote:
>
>> zImage on ppc64 BE is an ELF32 file. This patch set only supports loading
>> ELF files of the same class as the kernel, so a 64 bit kernel can't load an
>> ELF32 file. It would be possible to add such support, but it would be a new
>> feature.
>>
>> The distros I was able to check on ppc64 LE and BE all use vmlinux.
>> kexec-tools with kexec_load also doesn't support zImage. Do you think it is
>> important to support zImage?
>
> Well if it didn't work already, I think its low priority. Michael should be
> able to confirm this. Oliver's been trying to cleanup the zImage to get rid
> the old zImage limitation, cc'ing him

I don't think it's ever worked so I wouldn't worry too much about
supporting it. Fixing kexec-into-zImage and fixing the 32bit wrapper
on 64bit BE kernel problem has been on my TODO list for a while, but
it's not a priority.

oliver


Re: [PATCH] Fix "ibm,processor-radix-AP-encodings"

2016-10-09 Thread Oliver O'Halloran
On Wed, Sep 28, 2016 at 12:43 PM, Aneesh Kumar K.V
<aneesh.ku...@linux.vnet.ibm.com> wrote:
> Balbir Singh <bsinghar...@gmail.com> writes:
>
>> The top 3 bits of the lower order byte should contain the
>> AP encoding, we assume the top 3 bits of the MSB.

Balbir, could you reword this so it says "Currently we wrongly assume
" or similar. The current commit message made me think you were
changing it to look at the top 3 bits of the MSB rather than changing
it look at the LSB.

> Are you sure, Power architecture documents always confuse about MSB vs
> lowe order bytes. ?

PAPR seems to be pretty consistent about "low order" meaning "least
significant." Additionally the PAPR section that describes
ibm,processor-radix-AP-encodings says that it is formatted this way so
it can be used when constructing the register argument to tlbie. The
modes of tlbie that use the AP field place it in bits 56:59 so I think
Balbir's fix is correct.

Reviewed-By: Oliver O'Halloran <ooh...@gmail.com>


Re: Commit 1b7898ee276b "powerpc/boot: Use the pre-boot decompression API" breaks boot

2016-10-09 Thread Oliver O'Halloran
On Mon, Oct 10, 2016 at 3:41 PM, Michael Ellerman <m...@ellerman.id.au> wrote:
> Heiner Kallweit <hkallwe...@gmail.com> writes:
>
>> Am 07.10.2016 um 21:26 schrieb Heiner Kallweit:
>>> Am 07.10.2016 um 07:51 schrieb Oliver O'Halloran:
>>>> Hi, Heiner
>>>>
>>>> Could you send me a copy of the kernel .config (or which defconfig)
>>>> that you're using, the name of the HW platform that you're using and
>>>> if possible the kernel image itself?
>>>>
>>>> Thanks,
>>>> Oliver
>>>>
>>> Thanks for the quick reply. Attached are .config and cuImage.
>>> HW is a TP-Link TL-WDR4900 WiFi router (P1014-based) running OpenWRT.
>>>
>> After further checking I think I found the issue. The old gunzip code
>> handled uncompressed data transparently whilst the new one bails out
>> if it doesn't find a proper gzip header.
>> And in my case the actual kernel image is uncompressed.
>> With the following patch the system boots fine again (at least for me).
>
> Thanks for testing and tracking it down.

Yeah thanks for that. I was putting off looking at it until Monday :)

>
> I wonder why the actual image is uncompressed? Or alternately why do we
> tell uboot the image is compressed when it's not?

The uboot payload (wrapper, kernel, initrd) as a whole is compressed
as a single blob. Modern uboot can just decompress the payload and
jump straight into the kernel and I'd assumed that all uboot platforms
did this. The problem is that the compatible uboot (cuboot) images do
use the wrapper and the vmlinux baked into the wrapper is
uncompressed.

Oliver


[PATCH] powerpc/boot: fix the early OPAL console wrappers

2016-11-22 Thread Oliver O'Halloran
When configured with CONFIG_PPC_EARLY_DEBUG_OPAL=y the kernel expects
the OPAL entry and base addresses to be passed in r8 and r9
respectively. Currently the wrapper does not attempt to restore these
values before entering the decompressed kernel which causes the kernel
to branch into whatever happens to be in r9 when doing a write to the
OPAL console in early boot.

This patch adds a platform_ops hook that can be used to branch into the
new kernel. The OPAL console driver patches this at runtime so that if
the console is used it will be restored just prior to entering the
kernel.

Fixes: 656ad58ef19e
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/boot/main.c   |  8 ++--
 arch/powerpc/boot/opal-calls.S | 13 +
 arch/powerpc/boot/opal.c   | 11 +++
 arch/powerpc/boot/ops.h|  1 +
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c
index d80161b633f4..60522d22a428 100644
--- a/arch/powerpc/boot/main.c
+++ b/arch/powerpc/boot/main.c
@@ -217,8 +217,12 @@ void start(void)
console_ops.close();
 
kentry = (kernel_entry_t) vmlinux.addr;
-   if (ft_addr)
-   kentry(ft_addr, 0, NULL);
+   if (ft_addr) {
+   if(platform_ops.kentry)
+   platform_ops.kentry(ft_addr, vmlinux.addr);
+   else
+   kentry(ft_addr, 0, NULL);
+   }
else
kentry((unsigned long)initrd.addr, initrd.size,
   loader_info.promptr);
diff --git a/arch/powerpc/boot/opal-calls.S b/arch/powerpc/boot/opal-calls.S
index ff2f1b97bc53..2a99fc9a3ccf 100644
--- a/arch/powerpc/boot/opal-calls.S
+++ b/arch/powerpc/boot/opal-calls.S
@@ -12,6 +12,19 @@
 
.text
 
+   .globl opal_kentry
+opal_kentry:
+   /* r3 is the fdt ptr */
+   mtctr r4
+   li  r4, 0
+   li  r5, 0
+   li  r6, 0
+   li  r7, 0
+   ld  r11,opal@got(r2)
+   ld  r8,0(r11)
+   ld  r9,8(r11)
+   bctr
+
 #define OPAL_CALL(name, token) \
.globl name;\
 name:  \
diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c
index 1f37e1c1d6d8..d7b4fd47eb44 100644
--- a/arch/powerpc/boot/opal.c
+++ b/arch/powerpc/boot/opal.c
@@ -23,14 +23,25 @@ struct opal {
 
 static u32 opal_con_id;
 
+/* see opal-wrappers.S */
 int64_t opal_console_write(int64_t term_number, u64 *length, const u8 *buffer);
 int64_t opal_console_read(int64_t term_number, uint64_t *length, u8 *buffer);
 int64_t opal_console_write_buffer_space(uint64_t term_number, uint64_t 
*length);
 int64_t opal_console_flush(uint64_t term_number);
 int64_t opal_poll_events(uint64_t *outstanding_event_mask);
 
+void opal_kentry(unsigned long fdt_addr, void *vmlinux_addr);
+
 static int opal_con_open(void)
 {
+   /*
+* When OPAL loads the boot kernel it stashes the OPAL base and entry
+* address in r8 and r9 so the kernel can use the OPAL console
+* before unflattening the devicetree. While executing the wrapper will
+* probably trash r8 and r9 so this kentry hook restores them before
+* entering the decompressed kernel.
+*/
+   platform_ops.kentry = opal_kentry;
return 0;
 }
 
diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h
index e19b64ef977a..deeae6f6ba9c 100644
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@@ -30,6 +30,7 @@ struct platform_ops {
void *  (*realloc)(void *ptr, unsigned long size);
void(*exit)(void);
void *  (*vmlinux_alloc)(unsigned long size);
+   void(*kentry)(unsigned long fdt_addr, void *vmlinux_addr);
 };
 extern struct platform_ops platform_ops;
 
-- 
2.5.5



Re: [RFC][PATCH] powerpc/64be: use ELFv2 ABI for big endian kernels

2016-11-23 Thread Oliver O'Halloran
On Thu, Nov 24, 2016 at 1:38 AM, Segher Boessenkool
 wrote:
> On Thu, Nov 24, 2016 at 12:08:40AM +1100, Nicholas Piggin wrote:
>> Question, are there any fundamental reasons we shouldn't use the ELFv2
>> ABI to build big endian kernels if the compiler supports it?
>
> No one uses ELFv2 for BE in production, and it isn't thoroughly tested
> at all, not even regularly tested.  "Not supported", as far as GCC is
> concerned (or any of the distros AFAIK).

Is this actually unsupported by gcc? The ppc64 musl libc port is ABI
v2 only so they use it on BE too. Buildroot forces ABI v2 to be used
for all of userspace when musl is selected as the libc for this reason,
so it's not completely unused in the wild. It's still pretty niche
though...


[PATCH] powerpc/powernv: de-duplicate OPAL call wrappers

2016-10-30 Thread Oliver O'Halloran
Currently the code to perform an OPAL call is duplicated between the
normal path and path taken when tracepoints are enabled. There's no
real need for this and combining them makes opal_tracepoint_entry
considerably easier to understand.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/platforms/powernv/opal-wrappers.S | 44 ++
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S 
b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 44d2d842cee7..3ebe0db7ffeb 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -50,18 +50,14 @@ END_FTR_SECTION(0, 1);  
\
 #define OPAL_BRANCH(LABEL)
 #endif
 
-/* TODO:
+/*
+ * DO_OPAL_CALL assumes:
  *
- * - Trace irqs in/off (needs saving/restoring all args, argh...)
- * - Get r11 feed up by Dave so I can have better register usage
+ * r0 = OPAL call token
+ * LR has been saved on the stack
  */
 
-#define OPAL_CALL(name, token) \
- _GLOBAL_TOC(name);\
-   mflrr0; \
-   std r0,PPC_LR_STKOFF(r1);   \
-   li  r0,token;   \
-   OPAL_BRANCH(opal_tracepoint_entry) \
+#define DO_OPAL_CALL() \
mfcrr12;\
stw r12,8(r1);  \
li  r11,0;  \
@@ -81,6 +77,14 @@ END_FTR_SECTION(0, 1);   
\
mtspr   SPRN_HSRR0,r12; \
hrfid
 
+#define OPAL_CALL(name, token) \
+ _GLOBAL_TOC(name);\
+   mflrr0; \
+   std r0,PPC_LR_STKOFF(r1);   \
+   li  r0,token;   \
+   OPAL_BRANCH(opal_tracepoint_entry) \
+   DO_OPAL_CALL()
+
 opal_return:
/*
 * Fixup endian on OPAL return... we should be able to simplify
@@ -122,26 +126,12 @@ opal_tracepoint_entry:
ld  r8,STK_REG(R29)(r1)
ld  r9,STK_REG(R30)(r1)
ld  r10,STK_REG(R31)(r1)
+
+   /* return from the opal call via tracepoint_return */
LOAD_REG_ADDR(r11,opal_tracepoint_return)
-   mfcrr12
std r11,16(r1)
-   stw r12,8(r1)
-   li  r11,0
-   mfmsr   r12
-   ori r11,r11,MSR_EE
-   std r12,PACASAVEDMSR(r13)
-   andcr12,r12,r11
-   mtmsrd  r12,1
-   LOAD_REG_ADDR(r11,opal_return)
-   mtlrr11
-   li  r11,MSR_DR|MSR_IR|MSR_LE
-   andcr12,r12,r11
-   mtspr   SPRN_HSRR1,r12
-   LOAD_REG_ADDR(r11,opal)
-   ld  r12,8(r11)
-   ld  r2,0(r11)
-   mtspr   SPRN_HSRR0,r12
-   hrfid
+
+   DO_OPAL_CALL()
 
 opal_tracepoint_return:
std r3,STK_REG(R31)(r1)
-- 
2.5.5



Re: [PATCH v2 2/3] cpuidle:powernv: Add helper function to populate powernv idle states.

2016-11-01 Thread Oliver O'Halloran
exit_latency, 0);
> } else if ((flags[i] & OPAL_PM_STOP_INST_FAST) &&
> !(flags[i] & OPAL_PM_TIMEBASE_STOP)) {
> -   strncpy(powernv_states[nr_idle_states].name,
> -   names[i], CPUIDLE_NAME_LEN);
> -   strncpy(powernv_states[nr_idle_states].desc,
> -   names[i], CPUIDLE_NAME_LEN);
> -   powernv_states[nr_idle_states].flags = 0;
> -
> -   powernv_states[nr_idle_states].enter = stop_loop;
> -   stop_psscr_table[nr_idle_states] = psscr_val[i];
> +   add_powernv_state(nr_idle_states, names[i],
> + CPUIDLE_FLAG_NONE, stop_loop,
> + target_residency, exit_latency,
> + psscr_val[i]);
> }
>
> /*
> @@ -274,32 +300,20 @@ static int powernv_add_idle_states(void)
>  #ifdef CONFIG_TICK_ONESHOT
> if (flags[i] & OPAL_PM_SLEEP_ENABLED ||
> flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) {
> +   target_residency = 30;

Same comment as above.

> /* Add FASTSLEEP state */
> -   strcpy(powernv_states[nr_idle_states].name, 
> "FastSleep");
> -   strcpy(powernv_states[nr_idle_states].desc, 
> "FastSleep");
> -   powernv_states[nr_idle_states].flags = 
> CPUIDLE_FLAG_TIMER_STOP;
> -   powernv_states[nr_idle_states].target_residency = 
> 30;
> -   powernv_states[nr_idle_states].enter = fastsleep_loop;
> +   add_powernv_state(nr_idle_states, "FastSleep",
> + CPUIDLE_FLAG_TIMER_STOP,
> + fastsleep_loop,
> + target_residency, exit_latency, 0);
> } else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) &&
> (flags[i] & OPAL_PM_TIMEBASE_STOP)) {
> -   strncpy(powernv_states[nr_idle_states].name,
> -   names[i], CPUIDLE_NAME_LEN);
> -   strncpy(powernv_states[nr_idle_states].desc,
> -   names[i], CPUIDLE_NAME_LEN);
> -
> -   powernv_states[nr_idle_states].flags = 
> CPUIDLE_FLAG_TIMER_STOP;
> -   powernv_states[nr_idle_states].enter = stop_loop;
> -   stop_psscr_table[nr_idle_states] = psscr_val[i];
> +   add_powernv_state(nr_idle_states, names[i],
> + CPUIDLE_FLAG_TIMER_STOP, stop_loop,
> + target_residency, exit_latency,
> + psscr_val[i]);
> }
>  #endif
> -   powernv_states[nr_idle_states].exit_latency =
> -   ((unsigned int)latency_ns[i]) / 1000;
> -
> -   if (!rc) {
> -   powernv_states[nr_idle_states].target_residency =
> -   ((unsigned int)residency_ns[i]) / 1000;
> -   }
> -
> nr_idle_states++;
> }
>  out:
> diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
> index bb31373..c4e10f8 100644
> --- a/include/linux/cpuidle.h
> +++ b/include/linux/cpuidle.h
> @@ -62,6 +62,7 @@ struct cpuidle_state {
>  };
>
>  /* Idle State Flags */
> +#define CPUIDLE_FLAG_NONE   (0x00)
>  #define CPUIDLE_FLAG_COUPLED   (0x02) /* state applies to multiple cpus */
>  #define CPUIDLE_FLAG_TIMER_STOP (0x04)  /* timer is stopped on this state */
>
> --
> 1.9.4
>

Looks good otherwise.

Reviewed-by: Oliver O'Halloran <ooh...@gmail.com>


Re: Commit 1b7898ee276b "powerpc/boot: Use the pre-boot decompression API" breaks boot

2016-10-11 Thread Oliver O'Halloran
On Tue, Oct 11, 2016 at 7:06 AM, Heiner Kallweit  wrote:
>> IMHO in case of using cuboot no CONFIG_KERNEL_ config option
>> should be set and Makefile + code in arch/powerpc/boot should be able
>> to deal with this situation:
>> - don't copy and build the decompression stuff
>> - use an alternative version of prep_kernel() in main.c which doesn't
>>   attempt to decompress the kernel image
>>
>> This should be a cleaner solution than probing the kernel image whether
>> it's compressed or not.
>>
>
> This would be the patch implementing the idea. Advantage is that all
> the unnecessary decompression code isn't built. Works fine for me.

I don't think this approach is viable. The wrapper code is shared
among the various output image formats some of which *will* contain a
compressed kernel image so we can't simply remove the decompressor
from the wrapper. A random example I found in the makefile was
CONFIG_BAMBOO:

> image-$(CONFIG_BAMBOO) += treeImage.bamboo cuImage.bamboo

When building for this platform Kbuild will produce treeboot and a
cuboot image. Unlike uboot, Treeboot doesn't do any decompression so
the wrapper needs to decompress the kernel itself. The probing
solution more or less matches the old behaviour (which we know works)
so I think we should just stick with that.

- Oliver


[PATCH] powerpc/time: clear LPCR.LD when unneeded

2016-12-15 Thread Oliver O'Halloran
Currently the kernel will enable LD mode at boot when required. However,
when using kexec the second kernel may not want to have the LD enabled.
This patch ensures the second kernel will explicitly clear the LD flag
when not required by the current kernel.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/kernel/time.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index be9751f1cb2a..816700e8a475 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -925,18 +925,16 @@ static void register_decrementer_clockevent(int cpu)
 
 static void enable_large_decrementer(void)
 {
-   if (!cpu_has_feature(CPU_FTR_ARCH_300))
-   return;
-
-   if (decrementer_max <= DECREMENTER_DEFAULT_MAX)
-   return;
-
/*
 * If we're running as the hypervisor we need to enable the LD manually
 * otherwise firmware should have done it for us.
 */
-   if (cpu_has_feature(CPU_FTR_HVMODE))
+   if (decrementer_max > DECREMENTER_DEFAULT_MAX
+   && cpu_has_feature(CPU_FTR_HVMODE)
+   && cpu_has_feature(CPU_FTR_ARCH_300))
mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_LD);
+   else
+   mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_LD);
 }
 
 static void __init set_decrementer_max(void)
-- 
2.7.4



Re: [PATCH v5 2/5] powernv:stop: Uniformly rename power9 to arch300

2017-01-12 Thread Oliver O'Halloran
On Fri, Jan 13, 2017 at 2:44 PM, Gautham R Shenoy
 wrote:
> On Thu, Jan 12, 2017 at 03:17:33PM +0530, Balbir Singh wrote:
>> On Tue, Jan 10, 2017 at 02:37:01PM +0530, Gautham R. Shenoy wrote:
>> > From: "Gautham R. Shenoy" 
>> >
>> > Balbir pointed out that in idle_book3s.S and powernv/idle.c some
>> > functions and variables had power9 in their names while some others
>> > had arch300.
>> >
>>
>> I would prefer power9 to arch300
>>
>
>
> I don't have a strong preference for arch300 vs power9, will change it
> to power9 if that looks better.

Personally I think we should be as descriptive as possible and use
power_9_arch_300_the_bikeshed_is_red_dammit.

Oliver


[RFC PATCH] powerpc/powernv: report error messages from opal

2016-12-20 Thread Oliver O'Halloran
Recent versions of skiboot will raise an OPAL event (read: interrupt)
when firmware writes an error message to its internal console. In
conjunction they provide an OPAL call that the kernel can use to extract
these messages from the OPAL log to allow them to be written into the
kernel's log buffer where someone will (hopefully) look at them.

For the companion skiboot patches see:

https://lists.ozlabs.org/pipermail/skiboot/2016-December/005861.html

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/include/asm/opal-api.h|  5 +++-
 arch/powerpc/include/asm/opal.h|  1 +
 arch/powerpc/platforms/powernv/opal-msglog.c   | 41 ++
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 0e2e57bcab50..cb9c0e6afb33 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -167,7 +167,8 @@
 #define OPAL_INT_EOI   124
 #define OPAL_INT_SET_MFRR  125
 #define OPAL_PCI_TCE_KILL  126
-#define OPAL_LAST  126
+#define OPAL_SCRAPE_LOG128
+#define OPAL_LAST  128
 
 /* Device tree flags */
 
@@ -288,6 +289,7 @@ enum OpalPendingState {
OPAL_EVENT_PCI_ERROR   = 0x200,
OPAL_EVENT_DUMP_AVAIL  = 0x400,
OPAL_EVENT_MSG_PENDING = 0x800,
+   OPAL_EVENT_LOG_PENDING = 0x1000,
 };
 
 enum OpalThreadStatus {
@@ -406,6 +408,7 @@ enum opal_msg_type {
OPAL_MSG_DPO= 5,
OPAL_MSG_PRD= 6,
OPAL_MSG_OCC= 7,
+   OPAL_MSG_LOG= 8,
OPAL_MSG_TYPE_MAX,
 };
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 5c7db0f1a708..2b3bd3219fb4 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -232,6 +232,7 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t 
kill_type,
 int64_t opal_rm_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
 uint32_t pe_num, uint32_t tce_size,
 uint64_t dma_addr, uint32_t npages);
+int64_t opal_scrape_log(int64_t *offset, char *buf, int64_t len, int64_t *lvl);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c 
b/arch/powerpc/platforms/powernv/opal-msglog.c
index 39d6ff9e5630..78168f66fb24 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* OPAL in-memory console. Defined in OPAL source at core/console.c */
 struct memcons {
@@ -102,8 +103,36 @@ static struct bin_attribute opal_msglog_attr = {
.read = opal_msglog_read
 };
 
+static char *log_levels[] = { "Emergency", "Alert", "Critical", "Error", 
"Warning" };
+static int64_t offset = -1;
+
+static irqreturn_t opal_print_log(int irq, void *data)
+{
+   int64_t rc, log_lvl;
+   char buffer[320];
+
+   /*
+* only print one message per invokation of the IRQ handler
+*/
+
+   rc = opal_scrape_log(, buffer, sizeof(buffer), _lvl);
+
+   if (rc == OPAL_SUCCESS || rc == OPAL_PARTIAL) {
+   log_lvl = be64_to_cpu(log_lvl);
+   if (log_lvl > 4)
+   log_lvl = 4;
+
+   printk_emit(0, log_lvl, NULL, 0, "OPAL %s: %s%s\r\n",
+   log_levels[log_lvl], buffer,
+   rc == OPAL_PARTIAL ? "" : "");
+   }
+
+   return IRQ_HANDLED;
+}
+
 void __init opal_msglog_init(void)
 {
+   int virq, rc = -1;
u64 mcaddr;
struct memcons *mc;
 
@@ -123,6 +152,18 @@ void __init opal_msglog_init(void)
return;
}
 
+   virq = opal_event_request(ilog2(OPAL_EVENT_LOG_PENDING));
+   if (virq) {
+   rc = request_irq(virq, opal_print_log,
+   IRQF_TRIGGER_HIGH, "opal memcons", NULL);
+
+   if (rc)
+   irq_dispose_mapping(virq);
+   }
+
+   if (!virq || rc)
+   pr_warn("Unable to register OPAL log event handler\n");
+
opal_memcons = mc;
 }
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S 
b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3aa40f1b20f5..c59d7da3fd1a 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -312,3 +312,4 @@ OPAL_CALL(opal_int_set_mfrr,
OPAL_INT_SET_MFRR);
 OPAL_CALL_REAL(opal_rm_int_set_mfrr,   OPAL_INT_SET_MF

[PATCH v2] powerpc/powernv: de-duplicate OPAL call wrappers

2017-03-23 Thread Oliver O'Halloran
Currently the code to perform an OPAL call is duplicated between the
normal path and path taken when tracepoints are enabled. There's no
real need for this and combining them makes opal_tracepoint_entry
considerably easier to understand.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
v1 -> v2:
slight rework due to the real mode opal call changes
---
 arch/powerpc/platforms/powernv/opal-wrappers.S | 53 +++---
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S 
b/arch/powerpc/platforms/powernv/opal-wrappers.S
index da8a0f7a035c..ebf6719d241a 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1);  
\
 #define OPAL_BRANCH(LABEL)
 #endif
 
-/* TODO:
- *
- * - Trace irqs in/off (needs saving/restoring all args, argh...)
- * - Get r11 feed up by Dave so I can have better register usage
+/*
+ * DO_OPAL_CALL assumes:
+ * r0  = opal call token
+ * r12 = msr
+ * LR has been saved
  */
-
-#define OPAL_CALL(name, token) \
- _GLOBAL_TOC(name);\
-   mfmsr   r12;\
-   mflrr0; \
-   andi.   r11,r12,MSR_IR|MSR_DR;  \
-   std r0,PPC_LR_STKOFF(r1);   \
-   li  r0,token;   \
-   beq opal_real_call; \
-   OPAL_BRANCH(opal_tracepoint_entry) \
+#define DO_OPAL_CALL() \
mfcrr11;\
stw r11,8(r1);  \
li  r11,0;  \
@@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1);   
\
mtspr   SPRN_HSRR0,r12; \
hrfid
 
+#define OPAL_CALL(name, token) \
+ _GLOBAL_TOC(name);\
+   mfmsr   r12;\
+   mflrr0; \
+   andi.   r11,r12,MSR_IR|MSR_DR;  \
+   std r0,PPC_LR_STKOFF(r1);   \
+   li  r0,token;   \
+   beq opal_real_call; \
+   OPAL_BRANCH(opal_tracepoint_entry) \
+   DO_OPAL_CALL()
+
+
 opal_return:
/*
 * Fixup endian on OPAL return... we should be able to simplify
@@ -148,26 +152,13 @@ opal_tracepoint_entry:
ld  r8,STK_REG(R29)(r1)
ld  r9,STK_REG(R30)(r1)
ld  r10,STK_REG(R31)(r1)
+
+   /* setup LR so we return via tracepoint_return */
LOAD_REG_ADDR(r11,opal_tracepoint_return)
-   mfcrr12
std r11,16(r1)
-   stw r12,8(r1)
-   li  r11,0
+
mfmsr   r12
-   ori r11,r11,MSR_EE
-   std r12,PACASAVEDMSR(r13)
-   andcr12,r12,r11
-   mtmsrd  r12,1
-   LOAD_REG_ADDR(r11,opal_return)
-   mtlrr11
-   li  r11,MSR_DR|MSR_IR|MSR_LE
-   andcr12,r12,r11
-   mtspr   SPRN_HSRR1,r12
-   LOAD_REG_ADDR(r11,opal)
-   ld  r12,8(r11)
-   ld  r2,0(r11)
-   mtspr   SPRN_HSRR0,r12
-   hrfid
+   DO_OPAL_CALL()
 
 opal_tracepoint_return:
std r3,STK_REG(R31)(r1)
-- 
2.9.3



Re: Build failure -- powerpc/boot: Add OPAL console to epapr wrappers

2017-03-24 Thread Oliver O'Halloran
On Sat, Mar 25, 2017 at 4:00 AM, Daniel Walker <danie...@cisco.com> wrote:
> I get this build failure,
>
>
> In file included from arch/powerpc/boot/fdt.c:51:
> ../arch/powerpc/boot/libfdt_env.h:9: error: redefinition of typedef
> 'uint32_t'
> ../arch/powerpc/boot/types.h:20: note: previous declaration of 'uint32_t'
> was here
> ../arch/powerpc/boot/libfdt_env.h:10: error: redefinition of typedef
> 'uint64_t'
> ../arch/powerpc/boot/types.h:21: note: previous declaration of 'uint64_t'
> was here
> make[2]: *** [arch/powerpc/boot/fdt.o] Error 1
> make[1]: *** [uImage] Error 2
> make[1]: Leaving directory `/nobackup/danielwa/linux/t1040'
> make: *** [sub-make] Error 2
>
>
> and it bisects to ,
>
>
> commit 656ad58ef19e2a763fa5c938b20ae0f6b8d67242
> Author: Oliver O'Halloran <ooh...@gmail.com>
> Date:   Fri Jul 1 00:34:37 2016 +1000
>
> powerpc/boot: Add OPAL console to epapr wrappers
>
> This patch adds an OPAL console backend to the powerpc boot wrapper so
> that decompression failures inside the wrapper can be reported to the
> user. This is important since it typically indicates data corruption in
> the firmware and other nasty things.
>
> Currently this only works when building a little endian kernel. When
> compiling a 64 bit BE kernel the wrapper is always build 32 bit to be
> compatible with some 32 bit firmwares. BE support will be added at a
> later date. Another limitation of this is that only the "raw" type of
> OPAL console is supported, however machines that provide a hvsi console
> also provide a raw console so this is not an issue in practice.
>
> Actually-written-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
> Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
> [mpe: Move #ifdef __powerpc64__ to avoid warnings on 32-bit]
> Signed-off-by: Michael Ellerman <m...@ellerman.id.au>
>
>
> I can provide a config file if needed. My apologies if this was already
> reported.

Thanks for the report, I don't think this is a known bug. mpe's build
testing is pretty thorough so I'm surprised this wasn't caught sooner.

A config file and the version of gcc that you're using would be useful.

Oliver


Re: [PATCH 2/5] powerpc/smp: add set_cpus_related()

2017-03-22 Thread Oliver O'Halloran
On Wed, Mar 15, 2017 at 10:18 PM, Michael Ellerman <m...@ellerman.id.au> wrote:
> Oliver O'Halloran <ooh...@gmail.com> writes:
>> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
>> index dfe0e1d9cd06..1c531887ca51 100644
>> --- a/arch/powerpc/kernel/smp.c
>> +++ b/arch/powerpc/kernel/smp.c
>> @@ -377,6 +377,25 @@ static void smp_store_cpu_info(int id)
>>  #endif
>>  }
>>
>> +/*
>> + * Relationships between CPUs are maintained in a set of per-cpu cpumasks. 
>> We
>> + * need to ensure that they are kept consistant between CPUs when they are
>> + * changed.
>> + *
>> + * This is slightly tricky since the core mask must be a strict superset of
>> + * the sibling mask.
>> + */
>> +static void set_cpus_related(int i, int j, bool related, struct cpumask 
>> *(*relation_fn)(int))
>> +{
>> + if (related) {
>> + cpumask_set_cpu(i, relation_fn(j));
>> + cpumask_set_cpu(j, relation_fn(i));
>> + } else {
>> + cpumask_clear_cpu(i, relation_fn(j));
>> + cpumask_clear_cpu(j, relation_fn(i));
>> + }
>> +}
>
> I think you pushed the abstraction one notch too far on this one, or
> perhaps not far enough.
>
> We end up with a function called "set" that might clear, depending on a
> bool you pass. Which is hard to parse, eg:
>
> set_cpus_related(cpu, base + i, false, cpu_sibling_mask);
>
> And I know there's two places where we pass an existing bool "add", but
> there's four where we pass true or false.

I think you're looking at this patch. With the full series applied we
never pass a literal to set_cpus_related() directly:

[12:14 oliver ~/.../powerpc/kernel (p9-sched $%)]$ gg set_cpus_related
smp.c:391:static void set_cpus_related(int i, int j, bool related,
struct cpumask *(*relation_fn)(int))
smp.c:647:  set_cpus_related(cpu, cpu, add, cpu_core_mask);
smp.c:651:  set_cpus_related(cpu, i, add, cpu_core_mask);
smp.c:685:  set_cpus_related(cpu, cpu, onlining, mask_fn);
smp.c:697:  set_cpus_related(cpu, i, onlining, mask_fn);
smp.c:721:  set_cpus_related(cpu, base + i, onlining,
cpu_sibling_mask);
smp.c:736:  set_cpus_related(cpu, cpu, onlining, cpu_core_mask);
smp.c:746:  set_cpus_related(cpu, i, onlining, cpu_core_mask);

I agree that set_cpus_related() is probably a bad name,
make_cpus_related() maybe?

>
> If we want to push it in that direction I think we should just pass the
> set/clear routine instead of the flag, so:
>
> do_cpus_related(cpu, base + i, cpumask_clear_cpu, cpu_sibling_mask);
>
> But that might be overdoing it.

I think this would be ok.

>
> So I think we should just do:
>
> static void set_cpus_related(int i, int j, struct cpumask *(*mask_func)(int))
> {
> cpumask_set_cpu(i, mask_func(j));
> cpumask_set_cpu(j, mask_func(i));
> }
>
> static void clear_cpus_related(int i, int j, struct cpumask 
> *(*mask_func)(int))
> {
> cpumask_clear_cpu(i, mask_func(j));
> cpumask_clear_cpu(j, mask_func(i));
> }
>
>
> So the cases with add become:
>
> if (add)
> set_cpus_related(cpu, i, cpu_core_mask(i));
> else
> clear_cpus_related(cpu, i, cpu_core_mask(i));

Dunno, I was trying to get rid of this sort of thing since the logic
is duplicated in a lot of places. Seemed to me that it was just
pointlessly verbose rather than being helpfully explicit.

>
> Which is not as pretty but more explicit.
>
> And the other cases look much better, eg:
>
> clear_cpus_related(cpu, base + i, cpu_sibling_mask);
>
> ??
>
> cheers


Re: [PATCH 1/5] powerpc/smp: use cpu_to_chip_id() to find siblings

2017-03-22 Thread Oliver O'Halloran
On Wed, Mar 15, 2017 at 10:18 PM, Michael Ellerman <m...@ellerman.id.au> wrote:
> Oliver O'Halloran <ooh...@gmail.com> writes:
>
>> To determine which logical CPUs are on the same core the kernel uses the
>> ibm,chipid property from the device tree node associated with that cpu.
>> The lookup for this this information is currently open coded in both
>> traverse_siblings() and traverse_siblings_chip_id(). This patch replaces
>> these manual lookups with the existing cpu_to_chip_id() function.
>
> Some minor nits.
>
> cpu_to_chip_id() actually searches recursively up the parents until it
> finds a ibm,chip-id, so it's not a 1:1 replacement for the existing
> logic, but it's probably still an OK conversion. It's still worth
> mentioning in the change log though.

fair enough

>> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
>> index 893bd7f79be6..dfe0e1d9cd06 100644
>> --- a/arch/powerpc/kernel/smp.c
>> +++ b/arch/powerpc/kernel/smp.c
>> @@ -664,23 +655,19 @@ static void traverse_core_siblings(int cpu, bool add)
>>  {
>>   struct device_node *l2_cache, *np;
>>   const struct cpumask *mask;
>> - int i, chip, plen;
>> - const __be32 *prop;
>> + int chip_id;
>> + int i;
>>
>> - /* First see if we have ibm,chip-id properties in cpu nodes */
>> - np = of_get_cpu_node(cpu, NULL);
>> - if (np) {
>> - chip = -1;
>> - prop = of_get_property(np, "ibm,chip-id", &plen);
>> - if (prop && plen == sizeof(int))
>> - chip = of_read_number(prop, 1);
>> - of_node_put(np);
>> - if (chip >= 0) {
>> - traverse_siblings_chip_id(cpu, add, chip);
>> - return;
>> - }
>> + /* threads that share a chip-id are considered siblings (same die) */
>
> You might know it means the "same die", but AFAIK there's no actual
> definition for what the chip-id means, so let's not write comments that
> might be wrong in future. Just saying they're considered siblings is
> sufficient.
>
> Also "Threads" :)

The cpus masks are all built in terms of threads, so this is
technically correct even if it sounds stupid. Maybe "logical cpus"
would be better?

>
> cheers


Re: [PATCH 4/5] powerpc/smp: add cpu_cache_mask

2017-03-22 Thread Oliver O'Halloran
On Wed, Mar 15, 2017 at 10:26 PM, Michael Ellerman <m...@ellerman.id.au> wrote:
> Oliver O'Halloran <ooh...@gmail.com> writes:
>
>> Traditionally we have only ever tracked which CPUs are in the same core
>> (cpu_sibling_mask) and on the same die (cpu_core_mask). For Power9 we
>> need to be aware of which CPUs share cache with each other so this patch
>> adds cpu_cache_mask and the underlying cpu_cache_map variable to track
>> this.
>
> But which cache?

I'm not sure it matters. All the scheduler really wants to know is
that migrating between cpus with a shared cache is cheaper than
migrating elsewhere.

> Some CPUs on Power8 share L3, or L4.

Eh... it's not really the same. The "L4" is part of the memory buffers
and its function is conceptually different to the processor caches.
The L3 on P8 is only shared when the core that owns it is offline (or
sleeping) so the scheduler doesn't really need to be aware of it. Even
if the scheduler was aware I don't think it can take advantage of it
without some terrible hacks.

>
> I think just call it cpu_l2cache_map to make it explicit.

I was being deliberately vague. I know it's only a shared L2 currently,
but it's possible we might have a (real) shared L3 in the future. The
latest high-end x86 chips have some L3 sharing across the entire
chip so you never know. I'm not particularly attached to the name
though, so i'll rename it if you really want.

Oliver


Re: [PATCH 1/5] powerpc/smp: use cpu_to_chip_id() to find siblings

2017-03-27 Thread Oliver O'Halloran
On Tue, Mar 28, 2017 at 2:03 PM, Michael Ellerman <m...@ellerman.id.au> wrote:
> Oliver O'Halloran <ooh...@gmail.com> writes:
>> On Wed, Mar 15, 2017 at 10:18 PM, Michael Ellerman <m...@ellerman.id.au> 
>> wrote:
>>> Oliver O'Halloran <ooh...@gmail.com> writes:
>>>> + /* threads that share a chip-id are considered siblings (same die) */
>>>
>>> Also "Threads" :)
>>
>> The cpus masks are all built in terms of threads, so this is
>> technically correct even if it sounds stupid. Maybe "logical cpus"
>> would be better?
>
> No I meant you need a capital "T" !

capital letters are against my religion.

>
> cheers


[PATCH 2/2] powerpc/mm: add phys addr to linux page table dump

2017-03-30 Thread Oliver O'Halloran
The current page table dumper scans the linux page tables and coalesces
mappings with adjacent virtual addresses and similar PTE flags. This
behaviour is somewhat broken when you consider the IOREMAP space where
entirely unrelated mappings will appear to be contiguous.  This patch
modifies the range coalescing so that only ranges that are both physically
and virtually contiguous are combined. This patch also adds to the dump
output the physical address at the start of each range.

Cc: Rashmica Gupta <rashmic...@gmail.com>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/dump_linuxpagetables.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c 
b/arch/powerpc/mm/dump_linuxpagetables.c
index e7cbfd5a0940..85e6a45bd7ee 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -56,6 +56,8 @@ struct pg_state {
struct seq_file *seq;
const struct addr_marker *marker;
unsigned long start_address;
+   unsigned long start_pa;
+   unsigned long last_pa;
unsigned int level;
u64 current_flags;
 };
@@ -265,7 +267,9 @@ static void dump_addr(struct pg_state *st, unsigned long 
addr)
const char *unit = units;
unsigned long delta;
 
-   seq_printf(st->seq, "0x%016lx-0x%016lx   ", st->start_address, addr-1);
+   seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
+   seq_printf(st->seq, "%016lx ", st->start_pa);
+
delta = (addr - st->start_address) >> 10;
/* Work out what appropriate unit to use */
while (!(delta & 1023) && unit[1]) {
@@ -280,11 +284,15 @@ static void note_page(struct pg_state *st, unsigned long 
addr,
   unsigned int level, u64 val)
 {
u64 flag = val & pg_level[level].mask;
+   u64 pa = val & PTE_RPN_MASK;
+
/* At first no level is set */
if (!st->level) {
st->level = level;
st->current_flags = flag;
st->start_address = addr;
+   st->start_pa = pa;
+   st->last_pa = pa;
seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
/*
 * Dump the section of virtual memory when:
@@ -292,9 +300,11 @@ static void note_page(struct pg_state *st, unsigned long 
addr,
 *   - we change levels in the tree.
 *   - the address is in a different section of memory and is thus
 *   used for a different purpose, regardless of the flags.
+*   - the pa of this page is not adjacent to the last inspected page
 */
} else if (flag != st->current_flags || level != st->level ||
-  addr >= st->marker[1].start_address) {
+  addr >= st->marker[1].start_address ||
+  pa != st->last_pa + PAGE_SIZE) {
 
/* Check the PTE flags */
if (st->current_flags) {
@@ -318,8 +328,12 @@ static void note_page(struct pg_state *st, unsigned long 
addr,
seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
}
st->start_address = addr;
+   st->start_pa = pa;
+   st->last_pa = pa;
st->current_flags = flag;
st->level = level;
+   } else {
+   st->last_pa = pa;
}
 }
 
-- 
2.9.3



Re: [Patch v5] powerpc/powernv: add hdat attribute to sysfs

2017-03-16 Thread Oliver O'Halloran
On Thu, Mar 2, 2017 at 4:44 PM, Matt Brown  wrote:
> The HDAT data area is consumed by skiboot and turned into a device-tree.
> In some cases we would like to look directly at the HDAT, so this patch
> adds a sysfs node to allow it to be viewed.  This is not possible through
> /dev/mem as it is reserved memory which is stopped by the /dev/mem filter.
> This patch also adds sysfs nodes for all properties in the device-tree
> under /ibm,opal/firmware/exports.
>
> Signed-off-by: Matt Brown 
> ---
> Changes between v4 and v5:
> - all properties under /ibm,opal/firmware/exports in the device-tree
>   are now added as new sysfs nodes
> - the new sysfs nodes are now placed under /opal/exports
> - added a generic read function for all exported attributes
> ---
>  arch/powerpc/platforms/powernv/opal.c | 84 
> +++
>  1 file changed, 84 insertions(+)
>
> diff --git a/arch/powerpc/platforms/powernv/opal.c 
> b/arch/powerpc/platforms/powernv/opal.c
> index 2822935..fbb8264 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -36,6 +36,9 @@
>  /* /sys/firmware/opal */
>  struct kobject *opal_kobj;
>
> +/* /sys/firmware/opal/exports */
> +struct kobject *opal_export_kobj;
> +
>  struct opal {
> u64 base;
> u64 entry;
> @@ -604,6 +607,82 @@ static void opal_export_symmap(void)
> pr_warn("Error %d creating OPAL symbols file\n", rc);
>  }
>
> +

> +static int opal_exports_sysfs_init(void)
> +{
> +   opal_export_kobj = kobject_create_and_add("exports", opal_kobj);
> +   if (!opal_export_kobj) {
> +   pr_warn("kobject_create_and_add opal_exports failed\n");
> +   return -ENOMEM;
> +   }
> +
> +   return 0;
> +}

This can be folded into opal_export_attrs().

> +
> +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
> +struct bin_attribute *bin_attr, char *buf,
> +loff_t off, size_t count)
> +{
> +   return memory_read_from_buffer(buf, count, &off, bin_attr->private,
> +  bin_attr->size);
> +}
> +
> +static struct bin_attribute *exported_attrs;
> +/*
> + * opal_export_attrs: creates a sysfs node for each property listed in
> + * the device-tree under /ibm,opal/firmware/exports/
> + * All new sysfs nodes are created under /opal/exports/.
> + * This allows for reserved memory regions (e.g. HDAT) to be read.
> + * The new sysfs nodes are only readable by root.
> + */
> +static void opal_export_attrs(void)
> +{
> +   const __be64 *syms;
> +   unsigned int size;
> +   struct device_node *fw;
> +   struct property *prop;
> +   int rc;
> +   int attr_count = 0;
> +   int n = 0;
> +

> +   fw = of_find_node_by_path("/ibm,opal/firmware/exports");
> +   if (!fw)
> +   return;

devicetree nodes are reference counted so when you take a reference to
one using of_find_node_* you should use of_node_put() to drop the reference
when you're finished with it. Of course, there's plenty of existing code that
doesn't do this, but that's no reason to make a bad problem worse ;)

> +
> +   for (prop = fw->properties; prop != NULL; prop = prop->next)
> +   attr_count++;
> +
> +   if (attr_count > 2)
> +   exported_attrs = 
> kmalloc(sizeof(exported_attrs)*(attr_count-2),
> +   __GFP_IO | __GFP_FS);

Why are you using __GFP_IO | __GFP_FS instead of GFP_KERNEL? Also,
using kzalloc(), which zeros memory, over kmalloc() is a good idea in
general since structures can contain fields that change the behaviour
of the function that you pass them to.

> +
> +
> +   for_each_property_of_node(fw, prop) {
> +
> +   syms = of_get_property(fw, prop->name, );
> +
> +   if (!strcmp(prop->name, "name") ||
> +   !strcmp(prop->name, "phandle"))
> +   continue;
> +
> +   if (!syms || size != 2 * sizeof(__be64))
> +   continue;
> +

> +   (exported_attrs+n)->attr.name = prop->name;

References to DT properties are only valid if you have a reference to
the DT node that contains them. DT nodes and properties can (in
theory) be changed at runtime, but in practice this only really
happens for nodes that refer to hotpluggable devices (memory, PCI,
etc), but it's still poor form to rely on things not happening. You can
make a copy of the name with kstrdup() and store that pointer for as
long as you like, since you can guarantee the copy will exist until
you explicitly free() it.

> +   (exported_attrs+n)->attr.mode = 0400;
> +   (exported_attrs+n)->read = export_attr_read;
> +   (exported_attrs+n)->private = __va(be64_to_cpu(syms[0]));
> +   

[PATCH] powerpc/mm: remove stale comment

2017-04-03 Thread Oliver O'Halloran
The code to fix the problem it describes was removed in c40785a and it
uses the stupid comment style. Away it goes!

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/hash_utils_64.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8848fec..69a05b3 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -927,11 +927,6 @@ static void __init htab_initialize(void)
}
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
-   /* On U3 based machines, we need to reserve the DART area and
-* _NOT_ map it to avoid cache paradoxes as it's remapped non
-* cacheable later on
-*/
-
/* create bolted the linear mapping in the hash table */
for_each_memblock(memory, reg) {
base = (unsigned long)__va(reg->base);
-- 
2.9.3



[PATCH 2/9] mm/huge_memory: Deposit a pgtable for DAX PMD faults when required

2017-04-11 Thread Oliver O'Halloran
Although all architectures use a deposited page table for THP on anonymous VMAs
some architectures (s390 and powerpc) require the deposited storage even for
file backed VMAs due to quirks of their MMUs. This patch adds support for
depositing a table in DAX PMD fault handling path for archs that require it.
Other architectures should see no functional changes.

Cc: "Aneesh Kumar K.V" <aneesh.ku...@linux.vnet.ibm.com>
Cc: linux...@kvack.org
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 mm/huge_memory.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index aa01dd47cc65..a84909cf20d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -715,7 +715,8 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-   pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
+   pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
+   pgtable_t pgtable)
 {
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
@@ -729,6 +730,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, 
unsigned long addr,
entry = pmd_mkyoung(pmd_mkdirty(entry));
entry = maybe_pmd_mkwrite(entry, vma);
}
+
+   if (pgtable) {
+   pgtable_trans_huge_deposit(mm, pmd, pgtable);
+   atomic_long_inc(&mm->nr_ptes);
+   }
+
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
spin_unlock(ptl);
@@ -738,6 +745,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned 
long addr,
pmd_t *pmd, pfn_t pfn, bool write)
 {
pgprot_t pgprot = vma->vm_page_prot;
+   pgtable_t pgtable = NULL;
/*
 * If we had pmd_special, we could avoid all these restrictions,
 * but we need to be consistent with PTEs and architectures that
@@ -752,9 +760,15 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, 
unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
 
+   if (arch_needs_pgtable_deposit()) {
+   pgtable = pte_alloc_one(vma->vm_mm, addr);
+   if (!pgtable)
+   return VM_FAULT_OOM;
+   }
+
track_pfn_insert(vma, &pgprot, pfn);
 
-   insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+   insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -1611,6 +1625,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_dax(vma)) {
+   if (arch_needs_pgtable_deposit())
+   zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), 
HPAGE_PMD_SIZE);
-- 
2.9.3



[PATCH 1/9] mm/huge_memory: Use zap_deposited_table() more

2017-04-11 Thread Oliver O'Halloran
Depending flags of the PMD being zapped there may or may not be a
deposited pgtable to be freed. In two of the three cases this is open
coded while the third uses the zap_deposited_table() helper. This patch
converts the others to use the helper to clean things up a bit.

Cc: "Aneesh Kumar K.V" <aneesh.ku...@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
Cc: linux...@kvack.org
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
For reference:

void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
pgtable_t pgtable;

pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pte_free(mm, pgtable);
atomic_long_dec(&mm->nr_ptes);
}
---
 mm/huge_memory.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b787c4cfda0e..aa01dd47cc65 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1615,8 +1615,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), 
HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
-   pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
-   atomic_long_dec(&tlb->mm->nr_ptes);
+   zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
@@ -1625,10 +1624,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page)) {
-   pgtable_t pgtable;
-   pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
-   pte_free(tlb->mm, pgtable);
-   atomic_long_dec(&tlb->mm->nr_ptes);
+   zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
if (arch_needs_pgtable_deposit())
-- 
2.9.3



[PATCH 3/9] powerpc/mm: Add _PAGE_DEVMAP for ppc64.

2017-04-11 Thread Oliver O'Halloran
From: "Aneesh Kumar K.V" <aneesh.ku...@linux.vnet.ibm.com>

Add a _PAGE_DEVMAP bit for PTE and DAX PMD entires. PowerPC doesn't
currently support PUD faults so we haven't extended it to the PUD
level.

Cc: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 37 +++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index fb72ff6b98e6..b5fc6337649e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -78,6 +78,9 @@
 
 #define _PAGE_SOFT_DIRTY   _RPAGE_SW3 /* software: software dirty tracking 
*/
 #define _PAGE_SPECIAL  _RPAGE_SW2 /* software: special page */
+#define _PAGE_DEVMAP   _RPAGE_SW1
+#define __HAVE_ARCH_PTE_DEVMAP
+
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
  * Instead of fixing all of them, add an alternate define which
@@ -602,6 +605,16 @@ static inline pte_t pte_mkhuge(pte_t pte)
return pte;
 }
 
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+   return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP);
+}
+
+static inline int pte_devmap(pte_t pte)
+{
+   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+}
+
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
/* FIXME!! check whether this need to be a conditional */
@@ -966,6 +979,9 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd)))
 #define pmd_clear_savedwrite(pmd)  
pte_pmd(pte_clear_savedwrite(pmd_pte(pmd)))
 
+#define pud_pfn(...) (0)
+#define pgd_pfn(...) (0)
+
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 #define pmd_soft_dirty(pmd)pte_soft_dirty(pmd_pte(pmd))
 #define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
@@ -1140,7 +1156,6 @@ static inline int pmd_move_must_withdraw(struct spinlock 
*new_pmd_ptl,
return true;
 }
 
-
 #define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
 static inline bool arch_needs_pgtable_deposit(void)
 {
@@ -1149,6 +1164,26 @@ static inline bool arch_needs_pgtable_deposit(void)
return true;
 }
 
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+   return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
+}
+
+static inline int pmd_devmap(pmd_t pmd)
+{
+   return pte_devmap(pmd_pte(pmd));
+}
+
+static inline int pud_devmap(pud_t pud)
+{
+   return 0;
+}
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+   return 0;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
-- 
2.9.3



[PATCH 5/9] powerpc/vmemmap: Add altmap support

2017-04-11 Thread Oliver O'Halloran
Adds support to powerpc for the altmap feature of ZONE_DEVICE memory. An
altmap is a driver provided region that is used to provide the backing
storage for the struct pages of ZONE_DEVICE memory. In situations where
large amount of ZONE_DEVICE memory is being added to the system the
altmap reduces pressure on main system memory by allowing the mm/
metadata to be stored on the device itself rather in main memory.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/init_64.c | 20 +++-
 arch/powerpc/mm/mem.c | 16 +---
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index f8124edb6ffa..225fbb8034e6 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -171,13 +172,17 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
 
for (; start < end; start += page_size) {
+   struct vmem_altmap *altmap;
void *p;
int rc;
 
if (vmemmap_populated(start, page_size))
continue;
 
-   p = vmemmap_alloc_block(page_size, node);
+   /* altmap lookups only work at section boundaries */
+   altmap = to_vmem_altmap(SECTION_ALIGN_DOWN(start));
+
+   p =  __vmemmap_alloc_block_buf(page_size, node, altmap);
if (!p)
return -ENOMEM;
 
@@ -241,9 +246,10 @@ void __ref vmemmap_free(unsigned long start, unsigned long 
end)
pr_debug("vmemmap_free %lx...%lx\n", start, end);
 
for (; start < end; start += page_size) {
-   struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
-   unsigned int nr_pages;
-   unsigned long addr;
+   unsigned long nr_pages, addr;
+   struct vmem_altmap *altmap;
+   struct page *section_base;
+   struct page *page;
 
/*
 * the section has already be marked as invalid, so
@@ -258,9 +264,13 @@ void __ref vmemmap_free(unsigned long start, unsigned long 
end)
continue;
 
page = pfn_to_page(addr >> PAGE_SHIFT);
+   section_base = pfn_to_page(vmemmap_section_start(start));
nr_pages = 1 << page_order;
 
-   if (PageReserved(page)) {
+   altmap = to_vmem_altmap((unsigned long) section_base);
+   if (altmap) {
+   vmem_altmap_free(altmap, nr_pages);
+   } else if (PageReserved(page)) {
/* allocated from bootmem */
if (page_size < PAGE_SIZE) {
/*
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 3bbba178b464..6f7b64eaa9d8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -176,7 +177,8 @@ int arch_remove_memory(u64 start, u64 size, enum 
memory_type type)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
-   struct zone *zone;
+   struct vmem_altmap *altmap;
+   struct page *page;
int ret;
 
/*
@@ -193,8 +195,16 @@ int arch_remove_memory(u64 start, u64 size, enum 
memory_type type)
return -EINVAL;
}
 
-   zone = page_zone(pfn_to_page(start_pfn));
-   ret = __remove_pages(zone, start_pfn, nr_pages);
+   /*
+* If we have an altmap then we need to skip over any reserved PFNs
+* when querying the zone.
+*/
+   page = pfn_to_page(start_pfn);
+   altmap = to_vmem_altmap((unsigned long) page);
+   if (altmap)
+   page += vmem_altmap_offset(altmap);
+
+   ret = __remove_pages(page_zone(page), start_pfn, nr_pages);
if (ret)
return ret;
 
-- 
2.9.3



[PATCH 6/9] powerpc, mm: Enable ZONE_DEVICE on powerpc

2017-04-11 Thread Oliver O'Halloran
Flip the switch. Running around and screaming "IT'S ALIVE" is optional,
but recommended.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 43d000e44424..d696af58f97f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -724,7 +724,7 @@ config ZONE_DEVICE
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
-   depends on X86_64 #arch_add_memory() comprehends device memory
+   depends on (X86_64 || PPC_BOOK3S_64)  #arch_add_memory() comprehends 
device memory
 
help
  Device memory hotplug support allows for establishing pmem,
-- 
2.9.3



[PATCH 9/9] powerpc: Add pmem API support

2017-04-11 Thread Oliver O'Halloran
Initial powerpc support for the arch-specific bit of the persistent
memory API. Nothing fancy here.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/Kconfig|   1 +
 arch/powerpc/include/asm/pmem.h | 109 
 arch/powerpc/kernel/misc_64.S   |   2 +-
 3 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/pmem.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d7413ed700b8..cf84d0db49ab 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -87,6 +87,7 @@ config PPC
select ARCH_HAS_DMA_SET_COHERENT_MASK
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
+   select ARCH_HAS_PMEM_API
select ARCH_HAS_SCALED_CPUTIME  if VIRT_CPU_ACCOUNTING_NATIVE
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/include/asm/pmem.h b/arch/powerpc/include/asm/pmem.h
new file mode 100644
index ..27da9594040f
--- /dev/null
+++ b/arch/powerpc/include/asm/pmem.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright(c) 2017 IBM Corporation. All rights reserved.
+ *
+ * Based on the x86 version.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ASM_POWERPC_PMEM_H__
+#define __ASM_POWERPC_PMEM_H__
+
+#include 
+#include 
+#include 
+
+/*
+ * See include/linux/pmem.h for API documentation
+ *
+ * PPC specific notes:
+ *
+ * 1. PPC has no non-temporal (cache bypassing) stores so we're stuck with
+ *doing cache writebacks.
+ *
+ * 2. DCBST is a suggestion. DCBF *will* force a writeback.
+ *
+ */
+
+static inline void arch_wb_cache_pmem(void *addr, size_t size)
+{
+   unsigned long iaddr = (unsigned long) addr;
+
+   /* NB: contains a barrier */
+   flush_inval_dcache_range(iaddr, iaddr + size);
+}
+
+/* invalidate and writeback are functionally identical */
+#define arch_invalidate_pmem arch_wb_cache_pmem
+
+static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
+{
+   int unwritten;
+
+   /*
+* We are copying between two kernel buffers, if
+* __copy_from_user_inatomic_nocache() returns an error (page
+* fault) we would have already reported a general protection fault
+* before the WARN+BUG.
+*
+* XXX: replace this with a hand-rolled memcpy+dcbf
+*/
+   unwritten = __copy_from_user_inatomic(dst, (void __user *) src, n);
+   if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+   __func__, dst, src, unwritten))
+   BUG();
+
+   arch_wb_cache_pmem(dst, n);
+}
+
+static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
+{
+   /*
+* TODO: We should have most of the infrastructure for MCE handling
+*   but it needs to be made slightly smarter.
+*/
+   memcpy(dst, src, n);
+   return 0;
+}
+
+static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
+   struct iov_iter *i)
+{
+   size_t len;
+
+   /* XXX: under what conditions would this return len < size? */
+   len = copy_from_iter(addr, bytes, i);
+   arch_wb_cache_pmem(addr, bytes - len);
+
+   return len;
+}
+
+static inline void arch_clear_pmem(void *addr, size_t size)
+{
+   void *start = addr;
+
+   /*
+* XXX: A hand rolled dcbz+dcbf loop would probably be better.
+*/
+
+   if (((uintptr_t) addr & ~PAGE_MASK) == 0) {
+   while (size >= PAGE_SIZE) {
+   clear_page(addr);
+   addr += PAGE_SIZE;
+   size -= PAGE_SIZE;
+   }
+   }
+
+   if (size)
+   memset(addr, 0, size);
+
+   arch_wb_cache_pmem(start, size);
+}
+
+#endif /* __ASM_POWERPC_PMEM_H__ */
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index c119044cad0d..1378a8d61faf 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -182,7 +182,7 @@ _GLOBAL(flush_dcache_phys_range)
isync
blr
 
-_GLOBAL(flush_inval_dcache_range)
+_GLOBAL_TOC(flush_inval_dcache_range)
ld  r10,PPC64_CACHES@toc(r2)
lwz r7,DCACHEL1BLOCKSIZE(r10)   /* Get dcache block size */
addir5,r7,-1
-- 
2.9.3



ZONE_DEVICE and pmem API support for powerpc

2017-04-11 Thread Oliver O'Halloran
Hi all,

This series adds support for ZONE_DEVICE and the pmem api on powerpc. Namely,
support for altmaps and the various bits and pieces required for DAX PMD faults.
The first two patches touch generic mm/ code, but otherwise this is fairly well
contained in arch/powerpc.

If the nvdimm folks could sanity check this series I'd appreciate it.

Series is based on next-20170411, but it should apply elsewhere with minor
fixups to arch_{add|remove}_memory due to conflicts with HMM.  For those
interested in testing this, there is a driver and matching firmware that carves
out some system memory for use as an emulated Con Tutto memory card.

Driver: https://github.com/oohal/linux/tree/contutto-next
Firmware: https://github.com/oohal/skiboot/tree/fake-contutto

Edit core/init.c:686 to control the amount of memory borrowed for the emulated
device.  I'm keeping the driver out of tree for a while, until 4.13, since I plan on
reworking the firmware interface anyway and there's at least one showstopper
bug.


Thanks,
Oliver



[PATCH 4/9] powerpc/mm: Reshuffle vmemmap_free()

2017-04-11 Thread Oliver O'Halloran
Removes an indentation level and shuffles some code around to make the
following patch cleaner. No functional changes.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/init_64.c | 47 +--
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index ec84b31c6c86..f8124edb6ffa 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -234,12 +234,15 @@ static unsigned long vmemmap_list_free(unsigned long 
start)
 void __ref vmemmap_free(unsigned long start, unsigned long end)
 {
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+   unsigned long page_order = get_order(page_size);
 
start = _ALIGN_DOWN(start, page_size);
 
pr_debug("vmemmap_free %lx...%lx\n", start, end);
 
for (; start < end; start += page_size) {
+   struct page *page;
+   unsigned int nr_pages;
unsigned long addr;
 
/*
@@ -251,29 +254,29 @@ void __ref vmemmap_free(unsigned long start, unsigned 
long end)
continue;
 
addr = vmemmap_list_free(start);
-   if (addr) {
-   struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
-
-   if (PageReserved(page)) {
-   /* allocated from bootmem */
-   if (page_size < PAGE_SIZE) {
-   /*
-* this shouldn't happen, but if it is
-* the case, leave the memory there
-*/
-   WARN_ON_ONCE(1);
-   } else {
-   unsigned int nr_pages =
-   1 << get_order(page_size);
-   while (nr_pages--)
-   free_reserved_page(page++);
-   }
-   } else
-   free_pages((unsigned long)(__va(addr)),
-   get_order(page_size));
-
-   vmemmap_remove_mapping(start, page_size);
+   if (!addr)
+   continue;
+
+   page = pfn_to_page(addr >> PAGE_SHIFT);
+   nr_pages = 1 << page_order;
+
+   if (PageReserved(page)) {
+   /* allocated from bootmem */
+   if (page_size < PAGE_SIZE) {
+   /*
+* this shouldn't happen, but if it is
+* the case, leave the memory there
+*/
+   WARN_ON_ONCE(1);
+   } else {
+   while (nr_pages--)
+   free_reserved_page(page++);
+   }
+   } else {
+   free_pages((unsigned long)(__va(addr)), page_order);
}
+
+   vmemmap_remove_mapping(start, page_size);
}
 }
 #endif
-- 
2.9.3



[PATCH 7/9] powerpc/mm: Wire up ioremap_cache

2017-04-11 Thread Oliver O'Halloran
The default implementation of ioremap_cache() is aliased to ioremap().
On powerpc ioremap() creates cache-inhibited mappings by default which
is almost certainly not what you wanted.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/include/asm/io.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 5ed292431b5b..839eb031857f 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -757,6 +757,8 @@ extern void __iomem *ioremap_prot(phys_addr_t address, 
unsigned long size,
 extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
 #define ioremap_nocache(addr, size)ioremap((addr), (size))
 #define ioremap_uc(addr, size) ioremap((addr), (size))
+#define ioremap_cache(addr, size) \
+   ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
 
 extern void iounmap(volatile void __iomem *addr);
 
-- 
2.9.3



[PATCH 8/9] powerpc/mm: Wire up hpte_removebolted for powernv

2017-04-11 Thread Oliver O'Halloran
From: Rashmica Gupta <rashmic...@gmail.com>

Adds support for removing bolted (i.e kernel linear mapping) mappings on
powernv. This is needed to support memory hot unplug operations which
are required for the teardown of DAX/PMEM devices.

Cc: Rashmica Gupta <rashmic...@gmail.com>
Cc: Anton Blanchard <an...@samba.org>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
Could the original author of this add their S-o-b? I pulled it out of
Rashmica's memtrace patch, but I remember someone saying Anton wrote
it originally.
---
 arch/powerpc/mm/hash_native_64.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 65bb8f33b399..9ba91d4905a4 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long 
newpp, unsigned long ea,
tlbie(vpn, psize, psize, ssize, 0);
 }
 
+/*
+ * Remove a bolted kernel entry. Memory hotplug uses this.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
+{
+   unsigned long vpn;
+   unsigned long vsid;
+   long slot;
+   struct hash_pte *hptep;
+
+   vsid = get_kernel_vsid(ea, ssize);
+   vpn = hpt_vpn(ea, vsid, ssize);
+
+   slot = native_hpte_find(vpn, psize, ssize);
+   if (slot == -1)
+   return -ENOENT;
+
+   hptep = htab_address + slot;
+
+   /* Invalidate the hpte */
+   hptep->v = 0;
+
+   /* Invalidate the TLB */
+   tlbie(vpn, psize, psize, ssize, 0);
+   return 0;
+}
+
+
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
   int bpsize, int apsize, int ssize, int local)
 {
@@ -725,6 +755,7 @@ void __init hpte_init_native(void)
mmu_hash_ops.hpte_invalidate= native_hpte_invalidate;
mmu_hash_ops.hpte_updatepp  = native_hpte_updatepp;
mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
+   mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
mmu_hash_ops.hpte_insert= native_hpte_insert;
mmu_hash_ops.hpte_remove= native_hpte_remove;
mmu_hash_ops.hpte_clear_all = native_hpte_clear;
-- 
2.9.3



[PATCH 1/2] powerpc/mm: fix up pgtable dump flags

2017-03-30 Thread Oliver O'Halloran
On Book3s we have two PTE flags used to mark cache-inhibited mappings:
_PAGE_TOLERANT and _PAGE_NON_IDEMPOTENT. Currently the kernel page
table dumper only looks at the generic _PAGE_NO_CACHE which is
defined to be _PAGE_TOLERANT. This patch modifies the dumper so
both flags are shown in the dump.

Cc: Rashmica Gupta <rashmic...@gmail.com>
Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/mm/dump_linuxpagetables.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c 
b/arch/powerpc/mm/dump_linuxpagetables.c
index 49abaf4dc8e3..e7cbfd5a0940 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -154,11 +154,24 @@ static const struct flag_info flag_array[] = {
.clear  = " ",
}, {
 #endif
+#ifndef CONFIG_PPC_BOOK3S_64
.mask   = _PAGE_NO_CACHE,
.val= _PAGE_NO_CACHE,
.set= "no cache",
.clear  = "",
}, {
+#else
+   .mask   = _PAGE_NON_IDEMPOTENT,
+   .val= _PAGE_NON_IDEMPOTENT,
+   .set= "non-idempotent",
+   .clear  = "  ",
+   }, {
+   .mask   = _PAGE_TOLERANT,
+   .val= _PAGE_TOLERANT,
+   .set= "tolerant",
+   .clear  = "",
+   }, {
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
.mask   = H_PAGE_BUSY,
.val= H_PAGE_BUSY,
-- 
2.9.3



Re: ZONE_DEVICE and pmem API support for powerpc

2017-04-12 Thread Oliver O'Halloran
On Wed, Apr 12, 2017 at 4:22 AM, Dan Williams <dan.j.willi...@intel.com> wrote:
> On Tue, Apr 11, 2017 at 10:42 AM, Oliver O'Halloran <ooh...@gmail.com> wrote:
>> Hi all,
>>
>> This series adds support for ZONE_DEVICE and the pmem api on powerpc. Namely,
>> support for altmaps and the various bits and pieces required for DAX PMD 
>> faults.
>> The first two patches touch generic mm/ code, but otherwise this is fairly 
>> well
>> contained in arch/powerpc.
>>
>> If the nvdimm folks could sanity check this series I'd appreciate it.
>
> Quick feedback: I'm in the process of cleaning up and resubmitting my
> patch set to push the pmem api down into the driver directly.
>
> https://lwn.net/Articles/713064/

That's been on my radar for a while and I was hoping it would be in
4.12. Moving operations into the driver makes a lot of sense from a
design perspective and it should make supporting some of the
contutto's eccentricities a bit easier.

> I'm also reworking memory hotplug to allow sub-section allocations
> which has collided with Michal Hocko's hotplug reworks. It will be
> good to have some more eyes on that work to understand the cross-arch
> implications.
>
> https://lkml.org/lkml/2017/3/19/146

I'd been putting off looking at this since I figured it would clash
with the hotplug rework and HMM, but I'll see if I can get it working
on ppc.

>> Series is based on next-20170411, but it should apply elsewhere with minor
>> fixups to arch_{add|remove}_memory due to conflicts with HMM.  For those
>> interested in testing this, there is a driver and matching firmware that 
>> carves
>> out some system memory for use as an emulated Con Tutto memory card.
>>
>> Driver: https://github.com/oohal/linux/tree/contutto-next
>> Firmware: https://github.com/oohal/skiboot/tree/fake-contutto
>>
>> Edit core/init.c:686 to control the amount of memory borrowed for the 
>> emulated
>> device.  I'm keeping the driver out of tree for a until 4.13 since I plan on
>> reworking the firmware interface anyway and There's at least one showstopper
>> bug.
>
> Is this memory card I/O-cache coherent? I.e. existing dma mapping api
> can hand out mappings to it? Just trying to figure out if this the
> existing pmem-definition of ZONE_DEVICE or a new one.

As far as the rest of the system is concerned Con Tutto memory is
identical to normal system memory. All accesses to the card's memory
is mediated by a memory controller which participates in the memory
coherency protocol. That said, the link between the card and the
system is non-coherent so logic on the FPGA can access memory
incoherently. I'm primarily interested in using the card as a memory
platform so I haven't spent a lot of time thinking about the latter
use case, but a different concept of device memory might be required
there.

Oliver


Re: [v8] powerpc/powernv: add 'firmware/exports' attributes to sysfs

2017-04-06 Thread Oliver O'Halloran
On Thu, Mar 30, 2017 at 10:28 AM, Matt Brown
<matthew.brown@gmail.com> wrote:
> The HDAT data area is consumed by skiboot and turned into a device-tree. In
> some cases we would like to look directly at the HDAT. This is not possible
> through /dev/mem as it is reserved memory which is stopped by the /dev/mem
> filter. There are also other memory areas which are reserved but could be
> useful to view for debugging purposes.
>
> This patch adds sysfs nodes to allow specified memory areas to be viewed.
> sysfs nodes are created for each property in the device-tree under
> /ibm,opal/firmware/exports/, and adds them to /sys/firmware/opal/exports/
> with root read-only permissions.
>
> Signed-off-by: Matt Brown <matthew.brown@gmail.com>
> ---
> Changelog
> v8
> - fixed error handling
> - added dynamic allocation of attributes
> - using of_property_read_u64_array for reading attr vals
> - reordered vars
> - renaming vars
> ---
>  arch/powerpc/platforms/powernv/opal.c | 81 
> +++
>  1 file changed, 81 insertions(+)
>
> diff --git a/arch/powerpc/platforms/powernv/opal.c 
> b/arch/powerpc/platforms/powernv/opal.c
> index 2822935..232f94e 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -604,6 +604,84 @@ static void opal_export_symmap(void)
> pr_warn("Error %d creating OPAL symbols file\n", rc);
>  }
>
> +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
> +struct bin_attribute *bin_attr, char *buf,
> +loff_t off, size_t count)
> +{
> +   return memory_read_from_buffer(buf, count, &off, bin_attr->private,
> +  bin_attr->size);
> +}
> +
> +/*
> + * opal_export_attrs: creates a sysfs node for each property listed in
> + * the device-tree under /ibm,opal/firmware/exports/
> + * All new sysfs nodes are created under /opal/exports/.
> + * This allows for reserved memory regions (e.g. HDAT) to be read.
> + * The new sysfs nodes are only readable by root.
> + */
> +static void opal_export_attrs(void)
> +{
> +   struct bin_attribute *attr_tmp;
> +   struct device_node *np;
> +   struct property *prop;
> +   struct kobject *kobj;
> +   u64 vals[2];
> +   int rc, n;
> +
> +   /* Create new 'exports' directory - /sys/firmware/opal/exports */
> +   kobj = kobject_create_and_add("exports", opal_kobj);
> +   if (!kobj) {
> +   pr_warn("kobject_create_and_add exports failed\n");
> +   return;
> +   }
> +
> +   np = of_find_node_by_path("/ibm,opal/firmware/exports");
> +   if (!np)
> +   return;
> +
> +   n = 0;
> +   for (prop = np->properties; prop != NULL; prop = prop->next)
> +   n++;
> +
> +   if (n < 2)
> +   goto cleanup;
> +
> +   for_each_property_of_node(np, prop) {
> +   if (!strcmp(prop->name, "name") ||
> +   !strcmp(prop->name, "phandle"))
> +   continue;
> +
> +   if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
> +   continue;
> +
> +   attr_tmp = kmalloc(sizeof(*attr_tmp), GFP_KERNEL);
> +
> +   if (attr_tmp == NULL) {
> +   pr_warn("Failed kmalloc for bin_attribute attr_tmp");
> +   continue;
> +   }
> +
> +   attr_tmp->attr.name = kstrdup(prop->name, GFP_KERNEL);
> +   attr_tmp->attr.mode = 0400;
> +   attr_tmp->read = export_attr_read;
> +   attr_tmp->private = __va(vals[0]);
> +   attr_tmp->size = vals[1];
> +
> +   if (attr_tmp->attr.name == NULL) {
> +   pr_warn("Failed kstrdup for bin_attribute attr.name");
> +   kfree(attr_tmp);
> +   continue;
> +   }
> +   rc = sysfs_create_bin_file(kobj, attr_tmp);
> +   if (rc)
> +   pr_warn("Error %d creating OPAL sysfs exports/%s 
> file\n",
> + rc, prop->name);
> +   }
> +
> +cleanup:
> +   of_node_put(np);
> +}
> +
>  static void __init opal_dump_region_init(void)
>  {
> void *addr;
> @@ -742,6 +820,9 @@ static int __init opal_init(void)
> opal_msglog_sysfs_init();
> }
>
> +   /* Export all properties */
> +   opal_export_attrs();
> +
> /* Initialize platform devices: IPMI backend, PRD & flash interface */
> opal_pdev_init("ibm,opal-ipmi");
> opal_pdev_init("ibm,opal-flash");
> --
> 2.9.3
>

Reviewed-by: Oliver O'Halloran <ooh...@gmail.com>


Re: [PATCH 1/2] powerpc/mm: fix up pgtable dump flags

2017-04-12 Thread Oliver O'Halloran
On Wed, Apr 12, 2017 at 4:52 PM, Michael Ellerman <m...@ellerman.id.au> wrote:
> Rashmica Gupta <rashmic...@gmail.com> writes:
>
>> On 31/03/17 12:37, Oliver O'Halloran wrote:
>>> On Book3s we have two PTE flags used to mark cache-inhibited mappings:
>>> _PAGE_TOLERANT and _PAGE_NON_IDEMPOTENT. Currently the kernel page
>>> table dumper only looks at the generic _PAGE_NO_CACHE which is
>>> defined to be _PAGE_TOLERANT. This patch modifies the dumper so
>>> both flags are shown in the dump.
>>>
>>> Cc: Rashmica Gupta <rashmic...@gmail.com>
>>> Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
>
>> Should we also add in _PAGE_SAO  that is in Book3s?
>
> I don't think we ever expect to see it in the kernel page tables. But if
> we did that would be "interesting".
>
> I've forgotten what the code does with unknown bits, does it already
> print them in some way?

Currently it just traverses the list of known bits and prints out a
message for each. Printing any unknown bits is probably a good idea.
I'll send another patch to add that though and leave this one as-is.

> If not we should either add that or add _PAGE_SAO and everything else
> that could possibly ever be there.

ok


Re: [PATCH 8/9] powerpc/mm: Wire up hpte_removebolted for powernv

2017-04-12 Thread Oliver O'Halloran
On Wed, Apr 12, 2017 at 11:53 AM, Balbir Singh <bsinghar...@gmail.com> wrote:
> On Wed, 2017-04-12 at 03:42 +1000, Oliver O'Halloran wrote:
>> From: Rashmica Gupta <rashmic...@gmail.com>
>>
>> Adds support for removing bolted (i.e kernel linear mapping) mappings on
>> powernv. This is needed to support memory hot unplug operations which
>> are required for the teardown of DAX/PMEM devices.
>>
>> Cc: Rashmica Gupta <rashmic...@gmail.com>
>> Cc: Anton Blanchard <an...@samba.org>
>> Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
>> ---
>> Could the original author of this add their S-o-b? I pulled it out of
>> Rashmica's memtrace patch, but I remember someone saying Anton wrote
>> it originally.
>> ---
>>  arch/powerpc/mm/hash_native_64.c | 31 +++
>>  1 file changed, 31 insertions(+)
>>
>> diff --git a/arch/powerpc/mm/hash_native_64.c 
>> b/arch/powerpc/mm/hash_native_64.c
>> index 65bb8f33b399..9ba91d4905a4 100644
>> --- a/arch/powerpc/mm/hash_native_64.c
>> +++ b/arch/powerpc/mm/hash_native_64.c
>> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long 
>> newpp, unsigned long ea,
>>   tlbie(vpn, psize, psize, ssize, 0);
>>  }
>>
>> +/*
>> + * Remove a bolted kernel entry. Memory hotplug uses this.
>> + *
>> + * No need to lock here because we should be the only user.
>
> As long as this is after the necessary isolation and is called from
> arch_remove_memory(), I think we should be fine
>
>> + */
>> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
>> +{
>> + unsigned long vpn;
>> + unsigned long vsid;
>> + long slot;
>> + struct hash_pte *hptep;
>> +
>> + vsid = get_kernel_vsid(ea, ssize);
>> + vpn = hpt_vpn(ea, vsid, ssize);
>> +
>> + slot = native_hpte_find(vpn, psize, ssize);
>> + if (slot == -1)
>> + return -ENOENT;
>
> If slot == -1, it means someone else removed the HPTE entry? Are we racing?
> I suspect we should never hit this situation during hotunplug, specifically
> since this is bolted.

Or the slot was never populated in the first place. I'd rather keep
the current behaviour since it aligns with the behaviour of
pSeries_lpar_hpte_removebolted and we might hit these situations in
the future if the sub-section hotplug patches are merged (big if...).

>
>> +
>> + hptep = htab_address + slot;
>> +
>> + /* Invalidate the hpte */
>> + hptep->v = 0;
>
> Under DEBUG or otherwise, I would add more checks like
>
> 1. was hpte_v & HPTE_V_VALID and BOLTED set? If not, we've already invalidated
> that hpte and we can skip the tlbie. Since this was bolted you might be right
> that it is always valid and bolted

A VM_WARN_ON() if the bolted bit is clear might be appropriate. We
don't need to check the valid bit since hpte_native_find() will fail
if it's cleared.

>
>> +
>> + /* Invalidate the TLB */
>> + tlbie(vpn, psize, psize, ssize, 0);
>
> The API also does not clear linear_map_hash_slots[] under DEBUG_PAGEALLOC

I'm not sure what API you're referring to here. The tracking for
linear_map_hash_slots[] is agnostic of mmu_hash_ops so we shouldn't be
touching it here. It also looks like DEBUG_PAGEALLOC is a bit broken
with hotplugged memory anyway so I think that's a fix for a different
patch.

>
>> + return 0;
>> +}
>> +
>> +
>
> Balbir Singh.


[PATCH] powerpc/misc: fix exported functions that reference the TOC

2017-04-02 Thread Oliver O'Halloran
When the kernel is compiled to use 64bit ABIv2 the _GLOBAL() macro does not
include a global entry point. A function's global entry point is used when the
function is called from a different TOC context and in the kernel this
typically means a call from a module into the vmlinux (or vis-a-vis).

There are a few exported ASM functions declared with _GLOBAL() and calling
them from a module will likely crash the kernel since any TOC
relative load will yield garbage.

To fix this use _GLOBAL_TOC() for exported asm functions rather than _GLOBAL(),
and add some documentation about when to use each.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/include/asm/ppc_asm.h | 12 
 arch/powerpc/kernel/misc_64.S  |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h 
b/arch/powerpc/include/asm/ppc_asm.h
index 359c443..3abf8c3 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -198,6 +198,18 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 
 #ifdef PPC64_ELF_ABI_v2
 
+/*
+ * When to use _GLOBAL_TOC() instead of _GLOBAL():
+ *
+ * a) The function is exported using EXPORT_SYMBOL_*()
+ *  *and*
+ * b) The function, or any function that it calls, references the TOC.
+ *
+ * In this situation _GLOBAL_TOC() is required because exported functions are
+ * callable from modules which may use a different TOC to the kernel proper and the
+ * _GLOBAL() macro skips the TOC setup which is required on ELF ABIv2.
+ */
+
 #define _GLOBAL(name) \
.align 2 ; \
.type name,@function; \
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index ec94aef..d18da8c 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -67,7 +67,7 @@ PPC64_CACHES:
  *   flush all bytes from start through stop-1 inclusive
  */
 
-_GLOBAL(flush_icache_range)
+_GLOBAL_TOC(flush_icache_range)
 BEGIN_FTR_SECTION
PURGE_PREFETCHED_INS
blr
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(flush_icache_range)
  *
  *flush all bytes from start to stop-1 inclusive
  */
-_GLOBAL(flush_dcache_range)
+_GLOBAL_TOC(flush_dcache_range)
 
 /*
  * Flush the data cache to memory 
-- 
2.9.3



Re: [PATCH] of: introduce event tracepoints for dynamic device_node lifecyle

2017-04-18 Thread Oliver O'Halloran
On Wed, Apr 19, 2017 at 2:46 AM, Rob Herring  wrote:
> On Mon, Apr 17, 2017 at 7:32 PM, Tyrel Datwyler
>  wrote:
>> This patch introduces event tracepoints for tracking a device_nodes
>> reference cycle as well as reconfig notifications generated in response
>> to node/property manipulations.
>>
>> With the recent upstreaming of the refcount API several device_node
>> underflows and leaks have come to my attention in the pseries (DLPAR) dynamic
>> logical partitioning code (ie. POWER speak for hotplugging virtual and 
>> physcial
>> resources at runtime such as cpus or IOAs). These tracepoints provide a
>> easy and quick mechanism for validating the reference counting of
>> device_nodes during their lifetime.
>
> Not really relevant for this patch, but since you are looking at
> pseries and refcounting, the refcounting largely exists for pseries.
> It's also hard to get right as this type of fix is fairly common. It's
> now used for overlays, but we really probably only need to refcount
> the overlays or changesets as a whole, not at a node level. If you
> have any thoughts on how a different model of refcounting could work
> for pseries, I'd like to discuss it.

One idea I've been kicking around is differentiating short and long
term references to a node. I figure most leaks are due to a missing
of_node_put() within a stack frame so it might be possible to use the
ftrace infrastructure to detect and emit warnings if a short term
reference is leaked. Long term references are slightly harder to deal
with, but they're less common so we can add more detailed reference
tracking there (devm_of_get_node?).

Oliver


[PATCH] powerpc/mm: Check for _PAGE_PTE in *_devmap()

2017-07-27 Thread Oliver O'Halloran
The ISA radix translation tree contains two different types of entry,
directories and leaves. The formats of the two entries are different
with the directory entries containing no spare bits for use by software.
As a result we need to ensure that the *_devmap() family of functions
check fail for everything except leaf (PTE) entries.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c0737c8..e1989dd 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -610,7 +610,7 @@ static inline pte_t pte_mkdevmap(pte_t pte)
 
 static inline int pte_devmap(pte_t pte)
 {
-   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE));
 }
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-- 
2.9.3



[PATCH v2] powerpc/mm: Check for _PAGE_PTE in *_devmap()

2017-07-27 Thread Oliver O'Halloran
The ISA radix translation tree contains two different types of entry,
directories and leaves. The formats of the two entries are different
with the directory entries containing no spare bits for use by software.
As a result we need to ensure that the *_devmap() family of functions
check fail for everything except leaf (PTE) entries.

Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
---
"i'll just tweak the mbox before i sent it, what's the worst that can happen"
*completely breaks KVM*
"..."
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d1da415..6bc6248 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -610,7 +610,9 @@ static inline pte_t pte_mkdevmap(pte_t pte)
 
 static inline int pte_devmap(pte_t pte)
 {
-   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+   uint64_t mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
+
+   return (pte_raw(pte) & mask) == mask;
 }
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-- 
2.7.4



  1   2   3   4   5   6   7   >