vmx ghash buggy on ppc64le

2017-09-18 Thread Herbert Xu
Hi:

I have received a report that ghash on ppc64le does not interoperate
with other implementations of ghash, e.g., on x86-64.

https://bugzilla.redhat.com/show_bug.cgi?id=1490972

Could you guys take a look at this and see if this is a bug in
the mainline vmx driver too?

Thanks!
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 00/12] x86/crypto: Fix RBP usage in several crypto .S files

2017-09-18 Thread Herbert Xu
On Fri, Sep 15, 2017 at 11:06:29PM +0200, Ingo Molnar wrote:
>
> Indeed, I suspect they should go through the crypto tree, these fixes are 
> independent, they don't depend on anything in the x86 tree.

Sure I can pick them up through cryptodev.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH v8 4/7] iomap: introduce io{read|write}64_{lo_hi|hi_lo}

2017-09-18 Thread Logan Gunthorpe
In order to provide non-atomic functions for io{read|write}64 that will
use readq and writeq when appropriate, we define a number of variants
of these functions in the generic iomap that will do non-atomic
operations on pio but atomic operations on mmio.

These functions are only defined if readq and writeq are defined. If
they are not, then the wrappers that always use non-atomic operations
from include/linux/io-64-nonatomic*.h will be used.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Andy Shevchenko 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Arnd Bergmann 
Cc: Suresh Warrier 
Cc: Nicholas Piggin 
---
 arch/powerpc/include/asm/io.h |   2 +
 include/asm-generic/iomap.h   |  26 +++--
 lib/iomap.c   | 132 ++
 3 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index af074923d598..4cc420cfaa78 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -788,8 +788,10 @@ extern void __iounmap_at(void *ea, unsigned long size);
 
 #define mmio_read16be(addr)readw_be(addr)
 #define mmio_read32be(addr)readl_be(addr)
+#define mmio_read64be(addr)readq_be(addr)
 #define mmio_write16be(val, addr)  writew_be(val, addr)
 #define mmio_write32be(val, addr)  writel_be(val, addr)
+#define mmio_write64be(val, addr)  writeq_be(val, addr)
 #define mmio_insb(addr, dst, count)readsb(addr, dst, count)
 #define mmio_insw(addr, dst, count)readsw(addr, dst, count)
 #define mmio_insl(addr, dst, count)readsl(addr, dst, count)
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 650fede33c25..30edebf627fe 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -30,9 +30,16 @@ extern unsigned int ioread16(void __iomem *);
 extern unsigned int ioread16be(void __iomem *);
 extern unsigned int ioread32(void __iomem *);
 extern unsigned int ioread32be(void __iomem *);
-#ifdef CONFIG_64BIT
-extern u64 ioread64(void __iomem *);
-extern u64 ioread64be(void __iomem *);
+
+#ifdef readq
+#define ioread64_lo_hi ioread64_lo_hi
+#define ioread64_hi_lo ioread64_hi_lo
+#define ioread64be_lo_hi ioread64be_lo_hi
+#define ioread64be_hi_lo ioread64be_hi_lo
+extern u64 ioread64_lo_hi(void __iomem *addr);
+extern u64 ioread64_hi_lo(void __iomem *addr);
+extern u64 ioread64be_lo_hi(void __iomem *addr);
+extern u64 ioread64be_hi_lo(void __iomem *addr);
 #endif
 
 extern void iowrite8(u8, void __iomem *);
@@ -40,9 +47,16 @@ extern void iowrite16(u16, void __iomem *);
 extern void iowrite16be(u16, void __iomem *);
 extern void iowrite32(u32, void __iomem *);
 extern void iowrite32be(u32, void __iomem *);
-#ifdef CONFIG_64BIT
-extern void iowrite64(u64, void __iomem *);
-extern void iowrite64be(u64, void __iomem *);
+
+#ifdef writeq
+#define iowrite64_lo_hi iowrite64_lo_hi
+#define iowrite64_hi_lo iowrite64_hi_lo
+#define iowrite64be_lo_hi iowrite64be_lo_hi
+#define iowrite64be_hi_lo iowrite64be_hi_lo
+extern void iowrite64_lo_hi(u64 val, void __iomem *addr);
+extern void iowrite64_hi_lo(u64 val, void __iomem *addr);
+extern void iowrite64be_lo_hi(u64 val, void __iomem *addr);
+extern void iowrite64be_hi_lo(u64 val, void __iomem *addr);
 #endif
 
 /*
diff --git a/lib/iomap.c b/lib/iomap.c
index fc3dcb4b238e..845b9c41082c 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -66,6 +66,7 @@ static void bad_io_access(unsigned long port, const char 
*access)
 #ifndef mmio_read16be
 #define mmio_read16be(addr) be16_to_cpu(__raw_readw(addr))
 #define mmio_read32be(addr) be32_to_cpu(__raw_readl(addr))
+#define mmio_read64be(addr) be64_to_cpu(__raw_readq(addr))
 #endif
 
 unsigned int ioread8(void __iomem *addr)
@@ -99,6 +100,80 @@ EXPORT_SYMBOL(ioread16be);
 EXPORT_SYMBOL(ioread32);
 EXPORT_SYMBOL(ioread32be);
 
+#ifdef readq
+static u64 pio_read64_lo_hi(unsigned long port)
+{
+   u64 lo, hi;
+
+   lo = inl(port);
+   hi = inl(port + sizeof(u32));
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64_hi_lo(unsigned long port)
+{
+   u64 lo, hi;
+
+   hi = inl(port + sizeof(u32));
+   lo = inl(port);
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64be_lo_hi(unsigned long port)
+{
+   u64 lo, hi;
+
+   lo = pio_read32be(port + sizeof(u32));
+   hi = pio_read32be(port);
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64be_hi_lo(unsigned long port)
+{
+   u64 lo, hi;
+
+   hi = pio_read32be(port);
+   lo = pio_read32be(port + sizeof(u32));
+
+   return lo | (hi << 32);
+}
+
+u64 ioread64_lo_hi(void __iomem *addr)
+{
+   IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
+   return 0xffffffffffffffffULL;
+}
+
+u64 

[PATCH v8 5/7] io-64-nonatomic: add io{read|write}64[be]{_lo_hi|_hi_lo} macros

2017-09-18 Thread Logan Gunthorpe
This patch adds generic io{read|write}64[be]{_lo_hi|_hi_lo} macros if
they are not already defined by the architecture. (As they are provided
by the generic iomap library).

The patch also points io{read|write}64[be] to the variant specified by the
header name.

This is because new drivers are encouraged to use ioreadXX, et al instead
of readX[1], et al -- and mixing ioreadXX with readq is pretty ugly.

[1] LDD3: section 9.4.2

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Andy Shevchenko 
Cc: Christoph Hellwig 
Cc: Arnd Bergmann 
Cc: Alan Cox 
Cc: Greg Kroah-Hartman 
---
 include/linux/io-64-nonatomic-hi-lo.h | 64 +++
 include/linux/io-64-nonatomic-lo-hi.h | 64 +++
 2 files changed, 128 insertions(+)

diff --git a/include/linux/io-64-nonatomic-hi-lo.h 
b/include/linux/io-64-nonatomic-hi-lo.h
index defcc4644ce3..410c8b177080 100644
--- a/include/linux/io-64-nonatomic-hi-lo.h
+++ b/include/linux/io-64-nonatomic-hi-lo.h
@@ -54,4 +54,68 @@ static inline void hi_lo_writeq_relaxed(__u64 val, volatile 
void __iomem *addr)
 #define writeq_relaxed hi_lo_writeq_relaxed
 #endif
 
+#ifndef ioread64_hi_lo
+#define ioread64_hi_lo ioread64_hi_lo
+static inline u64 ioread64_hi_lo(void __iomem *addr)
+{
+   u32 low, high;
+
+   high = ioread32(addr + sizeof(u32));
+   low = ioread32(addr);
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64_hi_lo
+#define iowrite64_hi_lo iowrite64_hi_lo
+static inline void iowrite64_hi_lo(u64 val, void __iomem *addr)
+{
+   iowrite32(val >> 32, addr + sizeof(u32));
+   iowrite32(val, addr);
+}
+#endif
+
+#ifndef ioread64be_hi_lo
+#define ioread64be_hi_lo ioread64be_hi_lo
+static inline u64 ioread64be_hi_lo(void __iomem *addr)
+{
+   u32 low, high;
+
+   high = ioread32be(addr);
+   low = ioread32be(addr + sizeof(u32));
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64be_hi_lo
+#define iowrite64be_hi_lo iowrite64be_hi_lo
+static inline void iowrite64be_hi_lo(u64 val, void __iomem *addr)
+{
+   iowrite32be(val >> 32, addr);
+   iowrite32be(val, addr + sizeof(u32));
+}
+#endif
+
+#ifndef ioread64
+#define ioread64_is_nonatomic
+#define ioread64 ioread64_hi_lo
+#endif
+
+#ifndef iowrite64
+#define iowrite64_is_nonatomic
+#define iowrite64 iowrite64_hi_lo
+#endif
+
+#ifndef ioread64be
+#define ioread64be_is_nonatomic
+#define ioread64be ioread64be_hi_lo
+#endif
+
+#ifndef iowrite64be
+#define iowrite64be_is_nonatomic
+#define iowrite64be iowrite64be_hi_lo
+#endif
+
 #endif /* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */
diff --git a/include/linux/io-64-nonatomic-lo-hi.h 
b/include/linux/io-64-nonatomic-lo-hi.h
index 084461a4e5ab..acba36812be8 100644
--- a/include/linux/io-64-nonatomic-lo-hi.h
+++ b/include/linux/io-64-nonatomic-lo-hi.h
@@ -54,4 +54,68 @@ static inline void lo_hi_writeq_relaxed(__u64 val, volatile 
void __iomem *addr)
 #define writeq_relaxed lo_hi_writeq_relaxed
 #endif
 
+#ifndef ioread64_lo_hi
+#define ioread64_lo_hi ioread64_lo_hi
+static inline u64 ioread64_lo_hi(void __iomem *addr)
+{
+   u32 low, high;
+
+   low = ioread32(addr);
+   high = ioread32(addr + sizeof(u32));
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64_lo_hi
+#define iowrite64_lo_hi iowrite64_lo_hi
+static inline void iowrite64_lo_hi(u64 val, void __iomem *addr)
+{
+   iowrite32(val, addr);
+   iowrite32(val >> 32, addr + sizeof(u32));
+}
+#endif
+
+#ifndef ioread64be_lo_hi
+#define ioread64be_lo_hi ioread64be_lo_hi
+static inline u64 ioread64be_lo_hi(void __iomem *addr)
+{
+   u32 low, high;
+
+   low = ioread32be(addr + sizeof(u32));
+   high = ioread32be(addr);
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64be_lo_hi
+#define iowrite64be_lo_hi iowrite64be_lo_hi
+static inline void iowrite64be_lo_hi(u64 val, void __iomem *addr)
+{
+   iowrite32be(val, addr + sizeof(u32));
+   iowrite32be(val >> 32, addr);
+}
+#endif
+
+#ifndef ioread64
+#define ioread64_is_nonatomic
+#define ioread64 ioread64_lo_hi
+#endif
+
+#ifndef iowrite64
+#define iowrite64_is_nonatomic
+#define iowrite64 iowrite64_lo_hi
+#endif
+
+#ifndef ioread64be
+#define ioread64be_is_nonatomic
+#define ioread64be ioread64be_lo_hi
+#endif
+
+#ifndef iowrite64be
+#define iowrite64be_is_nonatomic
+#define iowrite64be iowrite64be_lo_hi
+#endif
+
 #endif /* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */
-- 
2.11.0



[PATCH v8 2/7] powerpc: io.h: move iomap.h include so that it can use readq/writeq defs

2017-09-18 Thread Logan Gunthorpe
Subsequent patches in this series make use of the readq and writeq
defines in iomap.h. However, as is, they get missed on the powerpc
platform because the include comes before the defines. This patch
moves the include down to fix this.

Signed-off-by: Logan Gunthorpe 
Acked-By: Michael Ellerman 
Reviewed-by: Andy Shevchenko 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Suresh Warrier 
Cc: "Oliver O'Halloran" 
---
 arch/powerpc/include/asm/io.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 422f99cf9924..af074923d598 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -33,8 +33,6 @@ extern struct pci_dev *isa_bridge_pcidev;
 #include 
 #include 
 
-#include <asm-generic/iomap.h>
-
 #ifdef CONFIG_PPC64
 #include 
 #endif
@@ -663,6 +661,8 @@ static inline void name at  
\
 #define writel_relaxed(v, addr)writel(v, addr)
 #define writeq_relaxed(v, addr)writeq(v, addr)
 
+#include <asm-generic/iomap.h>
+
 #ifdef CONFIG_PPC32
 #define mmiowb()
 #else
-- 
2.11.0



[PATCH v8 1/7] drm/tilcdc: ensure nonatomic iowrite64 is not used

2017-09-18 Thread Logan Gunthorpe
Add a check to ensure iowrite64 is only used if it is atomic.

It was decided in [1] that the tilcdc driver should not be using a
non-atomic operation (so it was left out of this patchset). However, it turns
out that through the drm code, a nonatomic header is actually included:

include/linux/io-64-nonatomic-lo-hi.h
is included from include/drm/drm_os_linux.h:9:0,
from include/drm/drmP.h:74,
from include/drm/drm_modeset_helper.h:26,
from include/drm/drm_atomic_helper.h:33,
from drivers/gpu/drm/tilcdc/tilcdc_crtc.c:19:

And thus, without this change, this patchset would inadvertently
change the behaviour of the tilcdc driver.

[1] 
lkml.kernel.org/r/cak8p3a2hho_zcnstzq7hmwsz5la5thu19fwzpun16imnyyn...@mail.gmail.com

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Andy Shevchenko 
Cc: Jyri Sarha 
Cc: Arnd Bergmann 
Cc: Tomi Valkeinen 
Cc: David Airlie 
---
 drivers/gpu/drm/tilcdc/tilcdc_regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/tilcdc/tilcdc_regs.h 
b/drivers/gpu/drm/tilcdc/tilcdc_regs.h
index 9d528c0a67a4..5048ebb86835 100644
--- a/drivers/gpu/drm/tilcdc/tilcdc_regs.h
+++ b/drivers/gpu/drm/tilcdc/tilcdc_regs.h
@@ -133,7 +133,7 @@ static inline void tilcdc_write64(struct drm_device *dev, 
u32 reg, u64 data)
struct tilcdc_drm_private *priv = dev->dev_private;
volatile void __iomem *addr = priv->mmio + reg;
 
-#ifdef iowrite64
+#if defined(iowrite64) && !defined(iowrite64_is_nonatomic)
iowrite64(data, addr);
 #else
__iowmb();
-- 
2.11.0



[PATCH v8 3/7] powerpc: iomap.c: introduce io{read|write}64_{lo_hi|hi_lo}

2017-09-18 Thread Logan Gunthorpe
These functions will be introduced into the generic iomap.c so
they can deal with PIO accesses in hi-lo/lo-hi variants. Thus,
the powerpc version of iomap.c will need to provide the same
functions even though, in this arch, they are identical to the
regular io{read|write}64 functions.

Signed-off-by: Logan Gunthorpe 
Tested-by: Horia Geantă 
Reviewed-by: Andy Shevchenko 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
---
 arch/powerpc/kernel/iomap.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index a1854d1ded8b..b43dbadfd24f 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -44,12 +44,32 @@ u64 ioread64(void __iomem *addr)
 {
return readq(addr);
 }
+u64 ioread64_lo_hi(void __iomem *addr)
+{
+   return readq(addr);
+}
+u64 ioread64_hi_lo(void __iomem *addr)
+{
+   return readq(addr);
+}
 u64 ioread64be(void __iomem *addr)
 {
return readq_be(addr);
 }
+u64 ioread64be_lo_hi(void __iomem *addr)
+{
+   return readq_be(addr);
+}
+u64 ioread64be_hi_lo(void __iomem *addr)
+{
+   return readq_be(addr);
+}
 EXPORT_SYMBOL(ioread64);
+EXPORT_SYMBOL(ioread64_lo_hi);
+EXPORT_SYMBOL(ioread64_hi_lo);
 EXPORT_SYMBOL(ioread64be);
+EXPORT_SYMBOL(ioread64be_lo_hi);
+EXPORT_SYMBOL(ioread64be_hi_lo);
 #endif /* __powerpc64__ */
 
 void iowrite8(u8 val, void __iomem *addr)
@@ -82,12 +102,32 @@ void iowrite64(u64 val, void __iomem *addr)
 {
writeq(val, addr);
 }
+void iowrite64_lo_hi(u64 val, void __iomem *addr)
+{
+   writeq(val, addr);
+}
+void iowrite64_hi_lo(u64 val, void __iomem *addr)
+{
+   writeq(val, addr);
+}
 void iowrite64be(u64 val, void __iomem *addr)
 {
writeq_be(val, addr);
 }
+void iowrite64be_lo_hi(u64 val, void __iomem *addr)
+{
+   writeq_be(val, addr);
+}
+void iowrite64be_hi_lo(u64 val, void __iomem *addr)
+{
+   writeq_be(val, addr);
+}
 EXPORT_SYMBOL(iowrite64);
+EXPORT_SYMBOL(iowrite64_lo_hi);
+EXPORT_SYMBOL(iowrite64_hi_lo);
 EXPORT_SYMBOL(iowrite64be);
+EXPORT_SYMBOL(iowrite64be_lo_hi);
+EXPORT_SYMBOL(iowrite64be_hi_lo);
 #endif /* __powerpc64__ */
 
 /*
-- 
2.11.0



[PATCH v8 0/7] make io{read|write}64 globally usable

2017-09-18 Thread Logan Gunthorpe
This is version eight of my patchset to enable drivers to use
io{read|write}64 on all arches.

--

Changes since v7:
- Fix minor nits from Andy Shevchenko
- Rebased onto v4.14-rc1

Changes since v6:
 ** none **

Changes since v5:
- Added a fix to the tilcdc driver to ensure it doesn't use the
  non-atomic operation. (This includes adding io{read|write}64[be]_is_nonatomic
  defines).

Changes since v4:
- Add functions so the powerpc implementation of iomap.c compiles. (As
  noticed by Horia)

Changes since v3:

- I noticed powerpc didn't use the appropriate functions because
  readq/writeq were not defined when iomap.h was included. Thus I've
  included a patch to adjust this.
- Fixed some mistakes with a couple of the defines in io-64-nonatomic*
  headers
- Fixed a typo noticed by Horia.

(earlier versions were drastically different)

--

Horia Geantă (1):
  crypto: caam: cleanup CONFIG_64BIT ifdefs when using io{read|write}64

Logan Gunthorpe (6):
  drm/tilcdc: ensure nonatomic iowrite64 is not used
  powerpc: io.h: move iomap.h include so that it can use readq/writeq
defs
  powerpc: iomap.c: introduce io{read|write}64_{lo_hi|hi_lo}
  iomap: introduce io{read|write}64_{lo_hi|hi_lo}
  io-64-nonatomic: add io{read|write}64[be]{_lo_hi|_hi_lo} macros
  ntb: ntb_hw_intel: use io-64-nonatomic instead of in-driver hacks

 arch/powerpc/include/asm/io.h |   6 +-
 arch/powerpc/kernel/iomap.c   |  40 +++
 drivers/crypto/caam/regs.h|  35 ++---
 drivers/gpu/drm/tilcdc/tilcdc_regs.h  |   2 +-
 drivers/ntb/hw/intel/ntb_hw_intel.c   |  30 +---
 include/asm-generic/iomap.h   |  26 +--
 include/linux/io-64-nonatomic-hi-lo.h |  64 +
 include/linux/io-64-nonatomic-lo-hi.h |  64 +
 lib/iomap.c   | 132 ++
 9 files changed, 331 insertions(+), 68 deletions(-)

--
2.11.0


[PATCH v8 6/7] ntb: ntb_hw_intel: use io-64-nonatomic instead of in-driver hacks

2017-09-18 Thread Logan Gunthorpe
Now that ioread64 and iowrite64 are available in io-64-nonatomic,
we can remove the hack at the top of ntb_hw_intel.c and replace it
with an include.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Andy Shevchenko 
Acked-by: Dave Jiang 
Acked-by: Allen Hubbe 
Acked-by: Jon Mason 
---
 drivers/ntb/hw/intel/ntb_hw_intel.c | 30 +-
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c 
b/drivers/ntb/hw/intel/ntb_hw_intel.c
index 2557e2c05b90..606c90f59d4b 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.c
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -59,6 +59,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ntb_hw_intel.h"
 
@@ -155,35 +156,6 @@ MODULE_PARM_DESC(xeon_b2b_dsd_bar5_addr32,
 static inline enum ntb_topo xeon_ppd_topo(struct intel_ntb_dev *ndev, u8 ppd);
 static int xeon_init_isr(struct intel_ntb_dev *ndev);
 
-#ifndef ioread64
-#ifdef readq
-#define ioread64 readq
-#else
-#define ioread64 _ioread64
-static inline u64 _ioread64(void __iomem *mmio)
-{
-   u64 low, high;
-
-   low = ioread32(mmio);
-   high = ioread32(mmio + sizeof(u32));
-   return low | (high << 32);
-}
-#endif
-#endif
-
-#ifndef iowrite64
-#ifdef writeq
-#define iowrite64 writeq
-#else
-#define iowrite64 _iowrite64
-static inline void _iowrite64(u64 val, void __iomem *mmio)
-{
-   iowrite32(val, mmio);
-   iowrite32(val >> 32, mmio + sizeof(u32));
-}
-#endif
-#endif
-
 static inline int pdev_is_atom(struct pci_dev *pdev)
 {
switch (pdev->device) {
-- 
2.11.0



[PATCH v8 7/7] crypto: caam: cleanup CONFIG_64BIT ifdefs when using io{read|write}64

2017-09-18 Thread Logan Gunthorpe
From: Horia Geantă 

We can now make use of the io-64-nonatomic-lo-hi header to always
provide 64 bit IO operations. So this patch cleans up the extra
CONFIG_64BIT ifdefs.

To be consistent with CAAM engine HW spec: in case of 64-bit registers,
irrespective of device endianness, the lower address should be read from
/ written to first, followed by the upper address. Admittedly, the I/O
accessors in the CAAM driver currently don't follow the spec; however, this
is a good opportunity to fix the code.

Signed-off-by: Horia Geantă 
Signed-off-by: Logan Gunthorpe 
Reviewed-by: Andy Shevchenko 
Cc: Horia Geantă 
Cc: Dan Douglass 
Cc: Herbert Xu 
Cc: "David S. Miller" 
---
 drivers/crypto/caam/regs.h | 35 +--
 1 file changed, 5 insertions(+), 30 deletions(-)

diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
index 2b5efff9ec3c..868ae7fb7c9f 100644
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@@ -9,7 +9,7 @@
 
 #include 
 #include 
-#include 
+#include 
 
 /*
  * Architecture-specific register access methods
@@ -134,50 +134,25 @@ static inline void clrsetbits_32(void __iomem *reg, u32 
clear, u32 set)
  *base + 0x0000 : least-significant 32 bits
  *base + 0x0004 : most-significant 32 bits
  */
-#ifdef CONFIG_64BIT
 static inline void wr_reg64(void __iomem *reg, u64 data)
 {
+#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
if (caam_little_end)
iowrite64(data, reg);
else
-   iowrite64be(data, reg);
-}
-
-static inline u64 rd_reg64(void __iomem *reg)
-{
-   if (caam_little_end)
-   return ioread64(reg);
-   else
-   return ioread64be(reg);
-}
-
-#else /* CONFIG_64BIT */
-static inline void wr_reg64(void __iomem *reg, u64 data)
-{
-#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
-   if (caam_little_end) {
-   wr_reg32((u32 __iomem *)(reg) + 1, data >> 32);
-   wr_reg32((u32 __iomem *)(reg), data);
-   } else
 #endif
-   {
-   wr_reg32((u32 __iomem *)(reg), data >> 32);
-   wr_reg32((u32 __iomem *)(reg) + 1, data);
-   }
+   iowrite64be(data, reg);
 }
 
 static inline u64 rd_reg64(void __iomem *reg)
 {
 #ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
if (caam_little_end)
-   return ((u64)rd_reg32((u32 __iomem *)(reg) + 1) << 32 |
-   (u64)rd_reg32((u32 __iomem *)(reg)));
+   return ioread64(reg);
else
 #endif
-   return ((u64)rd_reg32((u32 __iomem *)(reg)) << 32 |
-   (u64)rd_reg32((u32 __iomem *)(reg) + 1));
+   return ioread64be(reg);
 }
-#endif /* CONFIG_64BIT  */
 
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 #ifdef CONFIG_SOC_IMX7D
-- 
2.11.0



[PATCH v2 01/12] x86/crypto: Fix RBP usage in blowfish-x86_64-asm_64.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R12 instead of RBP.  R12 can't be used as the RT0 register because
of x86 instruction encoding limitations.  So use R12 for CTX and RDI for
RT0.  This means that CTX is no longer an implicit function argument.
Instead it needs to be explicitly copied from RDI.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/blowfish-x86_64-asm_64.S | 48 +---
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S 
b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 246c67006ed0..8c1fcb6bad21 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -33,7 +33,7 @@
 #define s3 ((16 + 2 + (3 * 256)) * 4)
 
 /* register macros */
-#define CTX %rdi
+#define CTX %r12
 #define RIO %rsi
 
 #define RX0 %rax
@@ -56,12 +56,12 @@
 #define RX2bh %ch
 #define RX3bh %dh
 
-#define RT0 %rbp
+#define RT0 %rdi
 #define RT1 %rsi
 #define RT2 %r8
 #define RT3 %r9
 
-#define RT0d %ebp
+#define RT0d %edi
 #define RT1d %esi
 #define RT2d %r8d
 #define RT3d %r9d
@@ -120,13 +120,14 @@
 
 ENTRY(__blowfish_enc_blk)
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  %rsi: dst
 *  %rdx: src
 *  %rcx: bool, if true: xor output
 */
-   movq %rbp, %r11;
+   movq %r12, %r11;
 
+   movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;
 
@@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk)
round_enc(14);
add_roundkey_enc(16);
 
-   movq %r11, %rbp;
+   movq %r11, %r12;
 
movq %r10, RIO;
test %cl, %cl;
@@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk)
 
 ENTRY(blowfish_dec_blk)
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  %rsi: dst
 *  %rdx: src
 */
-   movq %rbp, %r11;
+   movq %r12, %r11;
 
+   movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;
 
@@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk)
movq %r10, RIO;
write_block();
 
-   movq %r11, %rbp;
+   movq %r11, %r12;
 
ret;
 ENDPROC(blowfish_dec_blk)
@@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk)
 
 ENTRY(__blowfish_enc_blk_4way)
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  %rsi: dst
 *  %rdx: src
 *  %rcx: bool, if true: xor output
 */
-   pushq %rbp;
+   pushq %r12;
pushq %rbx;
pushq %rcx;
 
-   preload_roundkey_enc(0);
-
+   movq %rdi, CTX
movq %rsi, %r11;
movq %rdx, RIO;
 
+   preload_roundkey_enc(0);
+
read_block4();
 
round_enc4(0);
@@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way)
round_enc4(14);
add_preloaded_roundkey4();
 
-   popq %rbp;
+   popq %r12;
movq %r11, RIO;
 
-   test %bpl, %bpl;
+   test %r12b, %r12b;
jnz .L__enc_xor4;
 
write_block4();
 
popq %rbx;
-   popq %rbp;
+   popq %r12;
ret;
 
 .L__enc_xor4:
xor_block4();
 
popq %rbx;
-   popq %rbp;
+   popq %r12;
ret;
 ENDPROC(__blowfish_enc_blk_4way)
 
 ENTRY(blowfish_dec_blk_4way)
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  %rsi: dst
 *  %rdx: src
 */
-   pushq %rbp;
+   pushq %r12;
pushq %rbx;
-   preload_roundkey_dec(17);
 
-   movq %rsi, %r11;
+   movq %rdi, CTX;
+   movq %rsi, %r11
movq %rdx, RIO;
 
+   preload_roundkey_dec(17);
read_block4();
 
round_dec4(17);
@@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way)
write_block4();
 
popq %rbx;
-   popq %rbp;
+   popq %r12;
 
ret;
 ENDPROC(blowfish_dec_blk_4way)
-- 
2.13.5



[PATCH v2 04/12] x86/crypto: Fix RBP usage in cast6-avx-x86_64-asm_64.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R15 instead of RBP.  R15 can't be used as the RID1 register because
of x86 instruction encoding limitations.  So use R15 for CTX and RDI for
RID1.  This means that CTX is no longer an implicit function argument.
Instead it needs to be explicitly copied from RDI.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 50 +--
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S 
b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 952d3156a933..7f30b6f0d72c 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
 /**
   8-way AVX cast6
  **/
-#define CTX %rdi
+#define CTX %r15
 
 #define RA1 %xmm0
 #define RB1 %xmm1
@@ -70,8 +70,8 @@
 
 #define RTMP %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %rdi
+#define RID1d %edi
 #define RID2  %rsi
 #define RID2d %esi
 
@@ -264,15 +264,17 @@
 .align 8
 __cast6_enc_blk8:
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
 * output:
 *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 */
 
-   pushq %rbp;
+   pushq %r15;
pushq %rbx;
 
+   movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -297,7 +299,7 @@ __cast6_enc_blk8:
QBAR(11);
 
popq %rbx;
-   popq %rbp;
+   popq %r15;
 
vmovdqa .Lbswap_mask, RKM;
 
@@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8)
 .align 8
 __cast6_dec_blk8:
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 * output:
 *  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 */
 
-   pushq %rbp;
+   pushq %r15;
pushq %rbx;
 
+   movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -343,7 +347,7 @@ __cast6_dec_blk8:
QBAR(0);
 
popq %rbx;
-   popq %rbp;
+   popq %r15;
 
vmovdqa .Lbswap_mask, RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
@@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8)
 
 ENTRY(cast6_ecb_enc_8way)
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  %rsi: dst
 *  %rdx: src
 */
FRAME_BEGIN
+   pushq %r15;
 
+   movq %rdi, CTX;
movq %rsi, %r11;
 
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way)
 
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+   popq %r15;
FRAME_END
ret;
 ENDPROC(cast6_ecb_enc_8way)
 
 ENTRY(cast6_ecb_dec_8way)
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  %rsi: dst
 *  %rdx: src
 */
FRAME_BEGIN
+   pushq %r15;
 
+   movq %rdi, CTX;
movq %rsi, %r11;
 
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way)
 
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+   popq %r15;
FRAME_END
ret;
 ENDPROC(cast6_ecb_dec_8way)
 
 ENTRY(cast6_cbc_dec_8way)
/* input:
-*  %rdi: ctx, CTX
+*  %rdi: ctx
 *  %rsi: dst
 *  %rdx: src
 */
FRAME_BEGIN
-
pushq %r12;
+   pushq %r15;
 
+   movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
 
@@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way)
 
store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+   popq %r15;
popq %r12;
-
FRAME_END
ret;
 ENDPROC(cast6_cbc_dec_8way)
@@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way)
 *  %rcx: iv (little endian, 128bit)
 */
FRAME_BEGIN
-
pushq %r12;
+   pushq %r15
 
+   movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
 
@@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way)
 
store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+   popq %r15;
popq %r12;
-
FRAME_END
ret;
 ENDPROC(cast6_ctr_8way)
@@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way)
 *  %rcx: iv (t ⊕ αⁿ ∈ 

[PATCH v2 02/12] x86/crypto: Fix RBP usage in camellia-x86_64-asm_64.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R12 instead of RBP.  Both are callee-saved registers, so the
substitution is straightforward.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/camellia-x86_64-asm_64.S | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S 
b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 310319c601ed..95ba6956a7f6 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -75,17 +75,17 @@
 #define RCD1bh %dh
 
 #define RT0 %rsi
-#define RT1 %rbp
+#define RT1 %r12
 #define RT2 %r8
 
 #define RT0d %esi
-#define RT1d %ebp
+#define RT1d %r12d
 #define RT2d %r8d
 
 #define RT2bl %r8b
 
 #define RXOR %r9
-#define RRBP %r10
+#define RR12 %r10
 #define RDST %r11
 
 #define RXORd %r9d
@@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk)
 *  %rdx: src
 *  %rcx: bool xor
 */
-   movq %rbp, RRBP;
+   movq %r12, RR12;
 
movq %rcx, RXOR;
movq %rsi, RDST;
@@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk)
 
enc_outunpack(mov, RT1);
 
-   movq RRBP, %rbp;
+   movq RR12, %r12;
ret;
 
 .L__enc_xor:
enc_outunpack(xor, RT1);
 
-   movq RRBP, %rbp;
+   movq RR12, %r12;
ret;
 ENDPROC(__camellia_enc_blk)
 
@@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk)
movl $24, RXORd;
cmovel RXORd, RT2d; /* max */
 
-   movq %rbp, RRBP;
+   movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;
 
@@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk)
 
dec_outunpack();
 
-   movq RRBP, %rbp;
+   movq RR12, %r12;
ret;
 ENDPROC(camellia_dec_blk)
 
@@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way)
 */
pushq %rbx;
 
-   movq %rbp, RRBP;
+   movq %r12, RR12;
movq %rcx, RXOR;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way)
 
enc_outunpack2(mov, RT2);
 
-   movq RRBP, %rbp;
+   movq RR12, %r12;
popq %rbx;
ret;
 
 .L__enc2_xor:
enc_outunpack2(xor, RT2);
 
-   movq RRBP, %rbp;
+   movq RR12, %r12;
popq %rbx;
ret;
 ENDPROC(__camellia_enc_blk_2way)
@@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way)
cmovel RXORd, RT2d; /* max */
 
movq %rbx, RXOR;
-   movq %rbp, RRBP;
+   movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;
 
@@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way)
 
dec_outunpack2();
 
-   movq RRBP, %rbp;
+   movq RR12, %r12;
movq RXOR, %rbx;
ret;
 ENDPROC(camellia_dec_blk_2way)
-- 
2.13.5



[PATCH v2 05/12] x86/crypto: Fix RBP usage in des3_ede-asm_64.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use RSI instead of RBP for RT1.  Since RSI is also used as the 'dst'
function argument, it needs to be saved on the stack until the argument
is needed.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/des3_ede-asm_64.S | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/des3_ede-asm_64.S 
b/arch/x86/crypto/des3_ede-asm_64.S
index f3e91647ca27..8e49ce117494 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -64,12 +64,12 @@
 #define RW2bh %ch
 
 #define RT0 %r15
-#define RT1 %rbp
+#define RT1 %rsi
 #define RT2 %r14
 #define RT3 %rdx
 
 #define RT0d %r15d
-#define RT1d %ebp
+#define RT1d %esi
 #define RT2d %r14d
 #define RT3d %edx
 
@@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk)
 *  %rsi: dst
 *  %rdx: src
 */
-   pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
 
+   pushq %rsi; /* dst */
+
read_block(%rdx, RL0, RR0);
initial_permutation(RL0, RR0);
 
@@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk)
round1(32+15, RL0, RR0, dummy2);
 
final_permutation(RR0, RL0);
+
+   popq %rsi /* dst */
write_block(%rsi, RR0, RL0);
 
popq %r15;
@@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk)
popq %r13;
popq %r12;
popq %rbx;
-   popq %rbp;
 
ret;
 ENDPROC(des3_ede_x86_64_crypt_blk)
@@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 *  %rdx: src (3 blocks)
 */
 
-   pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
 
+   pushq %rsi /* dst */
+
/* load input */
movl 0 * 4(%rdx), RL0d;
movl 1 * 4(%rdx), RR0d;
@@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR2d;
bswapl RL2d;
 
+   popq %rsi /* dst */
movl RR0d, 0 * 4(%rsi);
movl RL0d, 1 * 4(%rsi);
movl RR1d, 2 * 4(%rsi);
@@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
popq %r13;
popq %r12;
popq %rbx;
-   popq %rbp;
 
ret;
 ENDPROC(des3_ede_x86_64_crypt_blk_3way)
-- 
2.13.5



[PATCH v2 09/12] x86/crypto: Fix RBP usage in sha256-avx2-asm.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

There's no need to use RBP as a temporary register for the TBL value,
because it always stores the same value: the address of the K256 table.
Instead just reference the address of K256 directly.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/sha256-avx2-asm.S | 22 +++---
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/sha256-avx2-asm.S 
b/arch/x86/crypto/sha256-avx2-asm.S
index 89c8f09787d2..1420db15dcdd 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -98,8 +98,6 @@ d = %r8d
 e   = %edx # clobbers NUM_BLKS
 y3 = %esi  # clobbers INP
 
-
-TBL= %rbp
 SRND   = CTX   # SRND is same register as CTX
 
 a = %eax
@@ -531,7 +529,6 @@ STACK_SIZE  = _RSP  + _RSP_SIZE
 ENTRY(sha256_transform_rorx)
 .align 32
pushq   %rbx
-   pushq   %rbp
pushq   %r12
pushq   %r13
pushq   %r14
@@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx)
mov CTX, _CTX(%rsp)
 
 loop0:
-   lea K256(%rip), TBL
-
## Load first 16 dwords from two blocks
VMOVDQ  0*32(INP),XTMP0
VMOVDQ  1*32(INP),XTMP1
@@ -597,19 +592,19 @@ last_block_enter:
 
 .align 16
 loop1:
-   vpaddd  0*32(TBL, SRND), X0, XFER
+   vpaddd  K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED   _XFER + 0*32
 
-   vpaddd  1*32(TBL, SRND), X0, XFER
+   vpaddd  K256+1*32(SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED   _XFER + 1*32
 
-   vpaddd  2*32(TBL, SRND), X0, XFER
+   vpaddd  K256+2*32(SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED   _XFER + 2*32
 
-   vpaddd  3*32(TBL, SRND), X0, XFER
+   vpaddd  K256+3*32(SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED   _XFER + 3*32
 
@@ -619,10 +614,11 @@ loop1:
 
 loop2:
## Do last 16 rounds with no scheduling
-   vpaddd  0*32(TBL, SRND), X0, XFER
+   vpaddd  K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS  _XFER + 0*32
-   vpaddd  1*32(TBL, SRND), X1, XFER
+
+   vpaddd  K256+1*32(SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS  _XFER + 1*32
add $2*32, SRND
@@ -676,9 +672,6 @@ loop3:
ja  done_hash
 
 do_last_block:
-    do last block
-   lea K256(%rip), TBL
-
VMOVDQ  0*16(INP),XWORD0
VMOVDQ  1*16(INP),XWORD1
VMOVDQ  2*16(INP),XWORD2
@@ -718,7 +711,6 @@ done_hash:
popq%r14
popq%r13
popq%r12
-   popq%rbp
popq%rbx
ret
 ENDPROC(sha256_transform_rorx)
-- 
2.13.5



[PATCH v2 07/12] x86/crypto: Fix RBP usage in sha1_ssse3_asm.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Swap the usages of R12 and RBP.  Use R12 for the REG_D register, and use
RBP to store the pre-aligned stack pointer.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/sha1_ssse3_asm.S | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index a4109506a5e8..6204bd53528c 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -37,7 +37,7 @@
 #define REG_A  %ecx
 #define REG_B  %esi
 #define REG_C  %edi
-#define REG_D  %ebp
+#define REG_D  %r12d
 #define REG_E  %edx
 
 #define REG_T1 %eax
@@ -74,10 +74,10 @@
ENTRY(\name)
 
push%rbx
-   push%rbp
push%r12
+   push%rbp
+   mov %rsp, %rbp
 
-   mov %rsp, %r12
sub $64, %rsp   # allocate workspace
and $~15, %rsp  # align stack
 
@@ -99,10 +99,9 @@
xor %rax, %rax
rep stosq
 
-   mov %r12, %rsp  # deallocate workspace
-
-   pop %r12
+   mov %rbp, %rsp  # deallocate workspace
pop %rbp
+   pop %r12
pop %rbx
ret
 
-- 
2.13.5



[PATCH v2 08/12] x86/crypto: Fix RBP usage in sha256-avx-asm.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Swap the usages of R12 and RBP.  Use R12 for the TBL register, and use
RBP to store the pre-aligned stack pointer.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/sha256-avx-asm.S | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index e0a1a5f2..001bbcf93c79 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -103,7 +103,7 @@ SRND = %rsi   # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
-TBL = %rbp
+TBL = %r12
 a = %eax
 b = %ebx
 
@@ -350,13 +350,13 @@ a = TMP_
 ENTRY(sha256_transform_avx)
 .align 32
pushq   %rbx
-   pushq   %rbp
+   pushq   %r12
pushq   %r13
pushq   %r14
pushq   %r15
-   pushq   %r12
+   pushq   %rbp
+   movq%rsp, %rbp
 
-   mov %rsp, %r12
subq$STACK_SIZE, %rsp   # allocate stack space
and $~15, %rsp  # align stack pointer
 
@@ -452,13 +452,12 @@ loop2:
 
 done_hash:
 
-   mov %r12, %rsp
-
-   popq%r12
+   mov %rbp, %rsp
+   popq%rbp
popq%r15
popq%r14
popq%r13
-   popq%rbp
+   popq%r12
popq%rbx
ret
 ENDPROC(sha256_transform_avx)
-- 
2.13.5



[PATCH v2 12/12] x86/crypto: Fix RBP usage in twofish-avx-x86_64-asm_64.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R13 instead of RBP.  Both are callee-saved registers, so the
substitution is straightforward.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S 
b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index b3f49d286348..73b471da3622 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -76,8 +76,8 @@
 #define RT %xmm14
 #define RR %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %r13
+#define RID1d %r13d
 #define RID2  %rsi
 #define RID2d %esi
 
@@ -259,7 +259,7 @@ __twofish_enc_blk8:
 
vmovdqu w(CTX), RK1;
 
-   pushq %rbp;
+   pushq %r13;
pushq %rbx;
pushq %rcx;
 
@@ -282,7 +282,7 @@ __twofish_enc_blk8:
 
popq %rcx;
popq %rbx;
-   popq %rbp;
+   popq %r13;
 
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
@@ -301,7 +301,7 @@ __twofish_dec_blk8:
 
vmovdqu (w+4*4)(CTX), RK1;
 
-   pushq %rbp;
+   pushq %r13;
pushq %rbx;
 
inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
@@ -322,7 +322,7 @@ __twofish_dec_blk8:
vmovdqu (w)(CTX), RK1;
 
popq %rbx;
-   popq %rbp;
+   popq %r13;
 
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
-- 
2.13.5



[PATCH v2 11/12] x86/crypto: Fix RBP usage in sha512-avx2-asm.S

2017-09-18 Thread Josh Poimboeuf
Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Mix things up a little bit to get rid of the RBP usage, without hurting
performance too much.  Use RDI instead of RBP for the TBL pointer.  That
will clobber CTX, so spill CTX onto the stack and use R12 to read it in
the outer loop.  R12 is used as a non-persistent temporary variable
elsewhere, so it's safe to use.

Also remove the unused y4 variable.

Reported-by: Eric Biggers 
Reported-by: Peter Zijlstra 
Tested-by: Eric Biggers 
Acked-by: Eric Biggers 
Signed-off-by: Josh Poimboeuf 
---
 arch/x86/crypto/sha512-avx2-asm.S | 75 ---
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/arch/x86/crypto/sha512-avx2-asm.S 
b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..b16d56005162 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -69,8 +69,9 @@ XFER  = YTMP0
 
 BYTE_FLIP_MASK  = %ymm9
 
-# 1st arg
-CTX = %rdi
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1= %rdi
+CTX2= %r12
 # 2nd arg
 INP = %rsi
 # 3rd arg
@@ -81,7 +82,7 @@ d   = %r8
 e   = %rdx
 y3  = %rsi
 
-TBL   = %rbp
+TBL   = %rdi # clobbers CTX1
 
 a = %rax
 b = %rbx
@@ -91,26 +92,26 @@ g = %r10
 h = %r11
 old_h = %r11
 
-T1= %r12
+T1= %r12 # clobbers CTX2
 y0= %r13
 y1= %r14
 y2= %r15
 
-y4= %r12
-
 # Local variables (stack frame)
 XFER_SIZE = 4*8
 SRND_SIZE = 1*8
 INP_SIZE = 1*8
 INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
 RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 6*8
+GPRSAVE_SIZE = 5*8
 
 frame_XFER = 0
 frame_SRND = frame_XFER + XFER_SIZE
 frame_INP = frame_SRND + SRND_SIZE
 frame_INPEND = frame_INP + INP_SIZE
-frame_RSPSAVE = frame_INPEND + INPEND_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_RSPSAVE = frame_CTX + CTX_SIZE
 frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
 frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 
@@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx)
mov %rax, frame_RSPSAVE(%rsp)
 
# Save GPRs
-   mov %rbp, frame_GPRSAVE(%rsp)
-   mov %rbx, 8*1+frame_GPRSAVE(%rsp)
-   mov %r12, 8*2+frame_GPRSAVE(%rsp)
-   mov %r13, 8*3+frame_GPRSAVE(%rsp)
-   mov %r14, 8*4+frame_GPRSAVE(%rsp)
-   mov %r15, 8*5+frame_GPRSAVE(%rsp)
+   mov %rbx, 8*0+frame_GPRSAVE(%rsp)
+   mov %r12, 8*1+frame_GPRSAVE(%rsp)
+   mov %r13, 8*2+frame_GPRSAVE(%rsp)
+   mov %r14, 8*3+frame_GPRSAVE(%rsp)
+   mov %r15, 8*4+frame_GPRSAVE(%rsp)
 
shl $7, NUM_BLKS# convert to bytes
jz  done_hash
@@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx)
mov NUM_BLKS, frame_INPEND(%rsp)
 
## load initial digest
-   mov 8*0(CTX),a
-   mov 8*1(CTX),b
-   mov 8*2(CTX),c
-   mov 8*3(CTX),d
-   mov 8*4(CTX),e
-   mov 8*5(CTX),f
-   mov 8*6(CTX),g
-   mov 8*7(CTX),h
+   mov 8*0(CTX1), a
+   mov 8*1(CTX1), b
+   mov 8*2(CTX1), c
+   mov 8*3(CTX1), d
+   mov 8*4(CTX1), e
+   mov 8*5(CTX1), f
+   mov 8*6(CTX1), g
+   mov 8*7(CTX1), h
+
+   # save %rdi (CTX) before it gets clobbered
+   mov %rdi, frame_CTX(%rsp)
 
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
 
@@ -652,14 +655,15 @@ loop2:
subq$1, frame_SRND(%rsp)
jne loop2
 
-   addm8*0(CTX),a
-   addm8*1(CTX),b
-   addm8*2(CTX),c
-   addm8*3(CTX),d
-   addm8*4(CTX),e
-   addm8*5(CTX),f
-   addm8*6(CTX),g
-   addm8*7(CTX),h
+   mov frame_CTX(%rsp), CTX2
+   addm8*0(CTX2), a
+   addm8*1(CTX2), b
+   addm8*2(CTX2), c
+   addm8*3(CTX2), d
+   addm8*4(CTX2), e
+   addm8*5(CTX2), f
+   addm8*6(CTX2), g
+   addm8*7(CTX2), h
 
mov frame_INP(%rsp), INP
add $128, INP
@@ -669,12 +673,11 @@ loop2:
 done_hash:
 
 # Restore GPRs
-   mov frame_GPRSAVE(%rsp) ,%rbp
-   mov 8*1+frame_GPRSAVE(%rsp) ,%rbx
-   mov 8*2+frame_GPRSAVE(%rsp) ,%r12
-   mov 8*3+frame_GPRSAVE(%rsp) ,%r13
-   mov 8*4+frame_GPRSAVE(%rsp) ,%r14
-   mov 8*5+frame_GPRSAVE(%rsp) ,%r15
+   mov 8*0+frame_GPRSAVE(%rsp), %rbx
+   mov 8*1+frame_GPRSAVE(%rsp), %r12
+   mov 8*2+frame_GPRSAVE(%rsp), %r13
+   mov 8*3+frame_GPRSAVE(%rsp), %r14
+   mov 8*4+frame_GPRSAVE(%rsp), %r15
 
# Restore Stack Pointer
mov frame_RSPSAVE(%rsp), %rsp
-- 
2.13.5



[PATCH v2 00/12] x86/crypto: Fix RBP usage in several crypto .S files

2017-09-18 Thread Josh Poimboeuf
v2:
- fix performance issues in sha256-avx2-asm.S and sha512-avx2-asm.S
  (Eric)

Many of the x86 crypto functions use RBP as a temporary register.  This
breaks frame pointer convention, and breaks stack traces when unwinding
from an interrupt in the crypto code.

Convert most* of them to leave RBP alone.

These pass the crypto boot tests for me.  Any further testing would be
appreciated!

[*] There are still a few crypto files left that need fixing, but the
fixes weren't trivial and nobody reported unwinder warnings about
them yet, so I'm skipping them for now.

Josh Poimboeuf (12):
  x86/crypto: Fix RBP usage in blowfish-x86_64-asm_64.S
  x86/crypto: Fix RBP usage in camellia-x86_64-asm_64.S
  x86/crypto: Fix RBP usage in cast5-avx-x86_64-asm_64.S
  x86/crypto: Fix RBP usage in cast6-avx-x86_64-asm_64.S
  x86/crypto: Fix RBP usage in des3_ede-asm_64.S
  x86/crypto: Fix RBP usage in sha1_avx2_x86_64_asm.S
  x86/crypto: Fix RBP usage in sha1_ssse3_asm.S
  x86/crypto: Fix RBP usage in sha256-avx-asm.S
  x86/crypto: Fix RBP usage in sha256-avx2-asm.S
  x86/crypto: Fix RBP usage in sha256-ssse3-asm.S
  x86/crypto: Fix RBP usage in sha512-avx2-asm.S
  x86/crypto: Fix RBP usage in twofish-avx-x86_64-asm_64.S

 arch/x86/crypto/blowfish-x86_64-asm_64.S| 48 +-
 arch/x86/crypto/camellia-x86_64-asm_64.S| 26 +-
 arch/x86/crypto/cast5-avx-x86_64-asm_64.S   | 47 +++---
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S   | 50 +--
 arch/x86/crypto/des3_ede-asm_64.S   | 15 +++---
 arch/x86/crypto/sha1_avx2_x86_64_asm.S  |  4 +-
 arch/x86/crypto/sha1_ssse3_asm.S| 11 ++---
 arch/x86/crypto/sha256-avx-asm.S| 15 +++---
 arch/x86/crypto/sha256-avx2-asm.S   | 22 +++--
 arch/x86/crypto/sha256-ssse3-asm.S  | 15 +++---
 arch/x86/crypto/sha512-avx2-asm.S   | 75 +++--
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 12 ++---
 12 files changed, 184 insertions(+), 156 deletions(-)

-- 
2.13.5



[PATCH v3 3/4] crypto: jz4780-rng: Add RNG node to jz4780.dtsi

2017-09-18 Thread PrasannaKumar Muralidharan
Add an RNG node to the jz4780 dtsi. The RNG driver uses registers that are
part of the register set used by the Ingenic CGU driver, so it accesses
them through regmap. Create a 'simple-bus' node and make the CGU and RNG
nodes its children so that both nodes are visible without changing the
CGU driver code.

Signed-off-by: PrasannaKumar Muralidharan 
---
Changes in v3:
* Create a cgublock node with "simple-bus" compatible
* Make CGU and RNG node as children of cgublock node.

Changes in v2:  
 
* Add "syscon" in CGU node's compatible section 
 
* Make RNG child node of CGU.   
 

 arch/mips/boot/dts/ingenic/jz4780.dtsi | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi 
b/arch/mips/boot/dts/ingenic/jz4780.dtsi
index 4853ef6..5953b97 100644
--- a/arch/mips/boot/dts/ingenic/jz4780.dtsi
+++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi
@@ -34,14 +34,29 @@
clock-frequency = <32768>;
};
 
-   cgu: jz4780-cgu@1000 {
-   compatible = "ingenic,jz4780-cgu";
+   cgublock {
+   compatible = "simple-bus";
+
+   #address-cells = <1>;
+   #size-cells = <1>;
+
reg = <0x1000 0x100>;
+   ranges;
 
-   clocks = <&ext>, <&rtc>;
-   clock-names = "ext", "rtc";
+   cgu: jz4780-cgu@0 {
+   compatible = "ingenic,jz4780-cgu";
+   reg = <0x1000 0x100>;
 
-   #clock-cells = <1>;
+   clocks = <&ext>, <&rtc>;
+   clock-names = "ext", "rtc";
+
+   #clock-cells = <1>;
+   };
+
+   rng: rng@d8 {
+   compatible = "ingenic,jz4780-rng";
+   reg = <0x10d8 0x8>;
+   };
};
 
pinctrl: pin-controller@1001 {
-- 
2.10.0



[PATCH v3 4/4] crypto: jz4780-rng: Enable PRNG support in CI20 defconfig

2017-09-18 Thread PrasannaKumar Muralidharan
Enable PRNG driver support in MIPS Creator CI20 default config.

Signed-off-by: PrasannaKumar Muralidharan 
---
No changes in v3

No changes in v2

 arch/mips/configs/ci20_defconfig | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig
index b42cfa7..9f48f2c 100644
--- a/arch/mips/configs/ci20_defconfig
+++ b/arch/mips/configs/ci20_defconfig
@@ -88,6 +88,11 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=5
 CONFIG_SERIAL_8250_INGENIC=y
 CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_HW_RANDOM is not set
+CONFIG_CRYPTO_USER=y
+CONFIG_CRYPTO_USER_API=y
+CONFIG_CRYPTO_USER_API_RNG=y
+CONFIG_CRYPTO_HW=y
+CONFIG_CRYPTO_DEV_JZ4780_RNG=y
 CONFIG_I2C=y
 CONFIG_I2C_JZ4780=y
 CONFIG_GPIO_SYSFS=y
-- 
2.10.0



[PATCH v3 0/4] crypto: Add driver for JZ4780 PRNG

2017-09-18 Thread PrasannaKumar Muralidharan
This patch series adds support for the pseudo random number generator
found in Ingenic's JZ4780 and X1000 SoCs.

Create cgublock node which has CGU and RNG node as its children. The
cgublock node uses "simple-bus" compatible which helps in exposing CGU
and RNG nodes without changing CGU driver. Add 'syscon' compatible in
CGU node in jz4780.dtsi. The jz4780-rng driver uses regmap exposed via
syscon interface to access the RNG registers. CGU driver is not
modified in this patch set as registers used by CGU driver and this
driver are different.

PrasannaKumar Muralidharan (4):
  crypto: jz4780-rng: Add JZ4780 PRNG devicetree binding documentation
  crypto: jz4780-rng: Add Ingenic JZ4780 hardware PRNG driver
  crypto: jz4780-rng: Add RNG node to jz4780.dtsi
  crypto: jz4780-rng: Enable PRNG support in CI20 defconfig

 .../devicetree/bindings/rng/ingenic,jz4780-rng.txt |  21 +++
 MAINTAINERS|   7 +
 arch/mips/boot/dts/ingenic/jz4780.dtsi |  25 ++-
 arch/mips/configs/ci20_defconfig   |   5 +
 drivers/crypto/Kconfig |  19 ++
 drivers/crypto/Makefile|   1 +
 drivers/crypto/jz4780-rng.c| 193 +
 7 files changed, 266 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/rng/ingenic,jz4780-rng.txt
 create mode 100644 drivers/crypto/jz4780-rng.c

-- 
2.10.0



[PATCH v3 2/4] crypto: jz4780-rng: Add Ingenic JZ4780 hardware PRNG driver

2017-09-18 Thread PrasannaKumar Muralidharan
JZ4780 SoC pseudo random number generator driver using crypto framework.

Adding a delay before reading RNG data and disabling RNG after reading
data was suggested by Jeffrey Walton.

Tested-by: Mathieu Malaterre 
Suggested-by: Jeffrey Walton 
Signed-off-by: PrasannaKumar Muralidharan 
---
Changes in v3:
* Add seeding support
* Reduce delay

Changes in v2:  

* Fixed buffer overflow in generate function pointed out in Stephan's review

* Fold patch that had only MAINTAINERS file change with this patch  

* Removed unnecessary comment in code   


 MAINTAINERS |   7 ++
 drivers/crypto/Kconfig  |  19 +
 drivers/crypto/Makefile |   1 +
 drivers/crypto/jz4780-rng.c | 193 
 4 files changed, 220 insertions(+)
 create mode 100644 drivers/crypto/jz4780-rng.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2093060..d2341a7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6783,6 +6783,13 @@ L:   linux-...@lists.infradead.org
 S: Maintained
 F: drivers/mtd/nand/jz4780_*
 
+INGENIC JZ4780 PRNG DRIVER
+M: PrasannaKumar Muralidharan 
+L: linux-crypto@vger.kernel.org
+S: Maintained
+F: drivers/crypto/jz4780-rng.c
+F: Documentation/devicetree/bindings/rng/ingenic,jz4780-rng.txt
+
 INOTIFY
 M: Jan Kara 
 R: Amir Goldstein 
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index fe33c19..f3ac1cd 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -613,6 +613,25 @@ config CRYPTO_DEV_IMGTEC_HASH
  hardware hash accelerator. Supporting MD5/SHA1/SHA224/SHA256
  hashing algorithms.
 
+config CRYPTO_DEV_JZ4780_RNG
+   tristate "JZ4780 HW pseudo random number generator support"
+   depends on MACH_JZ4780 || COMPILE_TEST
+   depends on HAS_IOMEM
+   select CRYPTO_RNG
+   select REGMAP
+   select SYSCON
+   select MFD_SYSCON
+   ---help---
+ This driver provides kernel-side support through the
+ cryptographic API for the pseudo random number generator
+ hardware found in ingenic JZ4780 and X1000 SoC. MIPS
+ Creator CI20 uses JZ4780 SoC.
+
+ To compile this driver as a module, choose M here: the
+ module will be called jz4780-rng.
+
+ If unsure, say Y.
+
 config CRYPTO_DEV_SUN4I_SS
tristate "Support for Allwinner Security System cryptographic 
accelerator"
depends on ARCH_SUNXI && !64BIT
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index 808432b..a09d9f4 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_CRYPTO_DEV_GEODE) += geode-aes.o
 obj-$(CONFIG_CRYPTO_DEV_HIFN_795X) += hifn_795x.o
 obj-$(CONFIG_CRYPTO_DEV_IMGTEC_HASH) += img-hash.o
 obj-$(CONFIG_CRYPTO_DEV_IXP4XX) += ixp4xx_crypto.o
+obj-$(CONFIG_CRYPTO_DEV_JZ4780_RNG) += jz4780-rng.o
 obj-$(CONFIG_CRYPTO_DEV_MV_CESA) += mv_cesa.o
 obj-$(CONFIG_CRYPTO_DEV_MARVELL_CESA) += marvell/
 obj-$(CONFIG_CRYPTO_DEV_MEDIATEK) += mediatek/
diff --git a/drivers/crypto/jz4780-rng.c b/drivers/crypto/jz4780-rng.c
new file mode 100644
index 000..918ba94
--- /dev/null
+++ b/drivers/crypto/jz4780-rng.c
@@ -0,0 +1,193 @@
+/*
+ * jz4780-rng.c - Random Number Generator driver for the jz4780
+ *
+ * Copyright (c) 2017 PrasannaKumar Muralidharan 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define REG_RNG_CTRL   0xD8
+#define REG_RNG_DATA   0xDC
+
+/* Context for crypto */
+struct jz4780_rng_ctx {
+   struct jz4780_rng *rng;
+};
+
+/* Device associated memory */
+struct jz4780_rng {
+   struct device *dev;
+   struct regmap *regmap;
+   u32 seed;
+};
+
+static struct jz4780_rng *jz4780_rng;
+
+static int jz4780_rng_readl(struct jz4780_rng *rng, u32 offset)
+{
+   u32 val = 0;
+   int ret;
+
+   ret = regmap_read(rng->regmap, offset, &val);
+   if (!ret)
+   return val;
+
+   return ret;
+}
+
+static int jz4780_rng_writel(struct jz4780_rng *rng, u32 val, u32 offset)
+{
+   return regmap_write(rng->regmap, offset, val);
+}
+
+static int jz4780_rng_seed(struct crypto_rng *tfm, const u8 *seed,
+  unsigned int slen)
+{
+   struct 

[PATCH v3 1/4] crypto: jz4780-rng: Add JZ4780 PRNG devicetree binding documentation

2017-09-18 Thread PrasannaKumar Muralidharan
Add devicetree bindings for hardware pseudo random number generator
present in Ingenic JZ4780 SoC.

Signed-off-by: PrasannaKumar Muralidharan 
---
Changes in v3:
* Create a cgublock node with "simple-bus" compatible
* Make CGU and RNG node as children of cgublock node.

Changes in v2:  
 
* Add "syscon" in CGU node's compatible section 
 
* Make RNG child node of CGU.   
 

 .../devicetree/bindings/rng/ingenic,jz4780-rng.txt  | 21 +
 1 file changed, 21 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/rng/ingenic,jz4780-rng.txt

diff --git a/Documentation/devicetree/bindings/rng/ingenic,jz4780-rng.txt 
b/Documentation/devicetree/bindings/rng/ingenic,jz4780-rng.txt
new file mode 100644
index 000..765df9c
--- /dev/null
+++ b/Documentation/devicetree/bindings/rng/ingenic,jz4780-rng.txt
@@ -0,0 +1,21 @@
+Ingenic jz4780 RNG driver
+
+Required properties:
+- compatible : Should be "ingenic,jz4780-rng"
+
+Example:
+
+cgublock {
+   compatible = "simple-bus";
+
+   #address-cells = <1>;
+   #size-cells = <1>;
+
+   reg = <0x1000 0x100>;
+   ranges;
+
+   rng: rng@d8 {
+   compatible = "ingenic,jz4780-rng";
+   reg = <0x10d8 0x8>;
+   };
+};
-- 
2.10.0



Re: [PATCH v3 0/3] STM32 CRYP crypto driver

2017-09-18 Thread Fabien DESSENNE
Just a gentle ping ... or have I missed out on a reply?


On 18/08/17 11:19, Fabien Dessenne wrote:
> This set of patches adds a new crypto driver for STMicroelectronics stm32 HW.
> This driver uses the crypto API and provides HW-enabled AEAD and block
> cipher algorithms.
> It makes use of the crypto engine which is upgraded in order to support AEAD
> requests.
>
> This driver was successfully tested with tcrypt / testmgr.
>
> Changes since v3:
> - update dt-bindings with Rob Herring remarks
>
> Changes since v2:
> - update dt-bindings (interrupts description)
> - rebase on STM32 crypto patches (L. Debieve : update CRC32 + add HASH)
>
> Fabien Dessenne (3):
>crypto: engine - permit to enqueue aead_request
>dt-bindings: Document STM32 CRYP bindings
>crypto: stm32 - Support for STM32 CRYP crypto module
>
>   .../devicetree/bindings/crypto/st,stm32-cryp.txt   |   19 +
>   crypto/crypto_engine.c |  101 +
>   drivers/crypto/stm32/Kconfig   |9 +
>   drivers/crypto/stm32/Makefile  |3 +-
>   drivers/crypto/stm32/stm32-cryp.c  | 1962 
> 
>   include/crypto/engine.h|   16 +
>   6 files changed, 2109 insertions(+), 1 deletion(-)
>   create mode 100644 
> Documentation/devicetree/bindings/crypto/st,stm32-cryp.txt
>   create mode 100644 drivers/crypto/stm32/stm32-cryp.c
>


Re: [PATCH 1/2] crypto: stm32 - Fix uninitialized data usage

2017-09-18 Thread Lionel DEBIEVE
Hi Arnd,

I already pushed this fix for review last month and am waiting for the ack.

"
From: Lionel Debieve 
To: Herbert Xu , "David S . Miller"
, Maxime Coquelin , 
Alexandre
  Torgue , ,
, 
CC: Benjamin Gaignard , Fabien Dessenne
, Ludovic Barre 
Subject: [PATCH 1/1] crypto: stm32/hash - Remove uninitialized symbol
Date: Fri, 18 Aug 2017 15:54:01 +0200
"

Sorry if you receive this mail twice; I didn't see any mail on the mailing
list, maybe due to a server issue.

I'm reviewing the second patch of your series.

BR,

Lionel

> On 09/12/2017 11:35 AM, Arnd Bergmann wrote:
>> The error handling in stm32_hash_irq_thread passes
>> uninitialized data into stm32_hash_finish_req, as gcc
>> points out:
>> drivers/crypto/stm32/stm32-hash.c: In function 'stm32_hash_irq_thread':
>> drivers/crypto/stm32/stm32-hash.c:1088:2: error: 'err' may be used 
>> uninitialized in this function [-Werror=maybe-uninitialized]
>> I could not tell what data should be passed there instead,
>> so this changes the code to always pass zero, making it
>> well-defined, though possibly still wrong. Please check.
>> Signed-off-by: Arnd Bergmann 
>> ---
>>drivers/crypto/stm32/stm32-hash.c | 3 +--
>>1 file changed, 1 insertion(+), 2 deletions(-)
>> diff --git a/drivers/crypto/stm32/stm32-hash.c 
>> b/drivers/crypto/stm32/stm32-hash.c
>> index b585ce54a802..3c23a23e9ee5 100644
>> --- a/drivers/crypto/stm32/stm32-hash.c
>> +++ b/drivers/crypto/stm32/stm32-hash.c
>> @@ -1067,7 +1067,6 @@ static int stm32_hash_cra_sha256_init(struct 
>> crypto_tfm *tfm)
>>static irqreturn_t stm32_hash_irq_thread(int irq, void *dev_id)
>>{
>>struct stm32_hash_dev *hdev = dev_id;
>> -int err;
>>
>>if (HASH_FLAGS_CPU & hdev->flags) {
>>if (HASH_FLAGS_OUTPUT_READY & hdev->flags) {
>> @@ -1085,7 +1084,7 @@ static irqreturn_t stm32_hash_irq_thread(int irq, void 
>> *dev_id)
>>
>>finish:
>>/*Finish current request */
>> -stm32_hash_finish_req(hdev->req, err);
>> +stm32_hash_finish_req(hdev->req, 0);
>>
>>return IRQ_HANDLED;
>>}
>