Re: [PATCH] [v2] crypto: sha512: add ARM NEON implementation

2014-06-30 Thread Ard Biesheuvel
On 29 June 2014 16:34, Jussi Kivilinna jussi.kivili...@iki.fi wrote:
 This patch adds ARM NEON assembly implementation of SHA-512 and SHA-384
 algorithms.

 tcrypt benchmark results on Cortex-A8, sha512-generic vs sha512-neon-asm:

 block-size  bytes/update  old-vs-new
 16          16            2.99x
 64          16            2.67x
 64          64            3.00x
 256         16            2.64x
 256         64            3.06x
 256         256           3.33x
 1024        16            2.53x
 1024        256           3.39x
 1024        1024          3.52x
 2048        16            2.50x
 2048        256           3.41x
 2048        1024          3.54x
 2048        2048          3.57x
 4096        16            2.49x
 4096        256           3.42x
 4096        1024          3.56x
 4096        4096          3.59x
 8192        16            2.48x
 8192        256           3.42x
 8192        1024          3.56x
 8192        4096          3.60x
 8192        8192          3.60x


Nice speedup!

 Changes in v2:
  - Use ENTRY/ENDPROC
  - Don't provide Thumb2 version


Please move Changelog below '---'

 Signed-off-by: Jussi Kivilinna jussi.kivili...@iki.fi

Acked-by: Ard Biesheuvel ard.biesheu...@linaro.org
Tested-by: Ard Biesheuvel ard.biesheu...@linaro.org

Tested on Exynos-5250 (Cortex-A15)

ARM-asm

[ 1715.164122] testing speed of sha512
[ 1715.164150] test  0 (   16 byte blocks,   16 bytes per update,   1
updates): 136277 opers/sec,   2180437 bytes/sec
[ 1718.159959] test  1 (   64 byte blocks,   16 bytes per update,   4
updates): 126636 opers/sec,   8104746 bytes/sec
[ 1721.159962] test  2 (   64 byte blocks,   64 bytes per update,   1
updates): 136605 opers/sec,   8742720 bytes/sec
[ 1724.159958] test  3 (  256 byte blocks,   16 bytes per update,  16
updates):  41576 opers/sec,  10643541 bytes/sec
[ 1727.159957] test  4 (  256 byte blocks,   64 bytes per update,   4
updates):  45984 opers/sec,  11771989 bytes/sec
[ 1730.159959] test  5 (  256 byte blocks,  256 bytes per update,   1
updates):  47479 opers/sec,  12154794 bytes/sec
[ 1733.159977] test  6 ( 1024 byte blocks,   16 bytes per update,  64
updates):  13410 opers/sec,  13731840 bytes/sec
[ 1736.160027] test  7 ( 1024 byte blocks,  256 bytes per update,   4
updates):  15916 opers/sec,  16298325 bytes/sec
[ 1739.159975] test  8 ( 1024 byte blocks, 1024 bytes per update,   1
updates):  16095 opers/sec,  16481280 bytes/sec
[ 1742.159993] test  9 ( 2048 byte blocks,   16 bytes per update, 128
updates):   7042 opers/sec,  14423381 bytes/sec
[ 1745.159994] test 10 ( 2048 byte blocks,  256 bytes per update,   8
updates):   8438 opers/sec,  17281024 bytes/sec
[ 1748.159995] test 11 ( 2048 byte blocks, 1024 bytes per update,   2
updates):   8541 opers/sec,  17492650 bytes/sec
[ 1751.160001] test 12 ( 2048 byte blocks, 2048 bytes per update,   1
updates):   8560 opers/sec,  17531562 bytes/sec
[ 1754.159975] test 13 ( 4096 byte blocks,   16 bytes per update, 256
updates):   3612 opers/sec,  14794752 bytes/sec
[ 1757.160103] test 14 ( 4096 byte blocks,  256 bytes per update,  16
updates):   4350 opers/sec,  17820330 bytes/sec
[ 1760.160122] test 15 ( 4096 byte blocks, 1024 bytes per update,   4
updates):   4405 opers/sec,  18042880 bytes/sec
[ 1763.159957] test 16 ( 4096 byte blocks, 4096 bytes per update,   1
updates):   4463 opers/sec,  18280448 bytes/sec
[ 1766.160049] test 17 ( 8192 byte blocks,   16 bytes per update, 512
updates):   1829 opers/sec,  14988629 bytes/sec
[ 1769.160328] test 18 ( 8192 byte blocks,  256 bytes per update,  32
updates):   2209 opers/sec,  18101589 bytes/sec
[ 1772.160318] test 19 ( 8192 byte blocks, 1024 bytes per update,   8
updates):   2238 opers/sec,  18333696 bytes/sec
[ 1775.160278] test 20 ( 8192 byte blocks, 4096 bytes per update,   2
updates):   2245 opers/sec,  18393770 bytes/sec
[ 1778.160025] test 21 ( 8192 byte blocks, 8192 bytes per update,   1
updates):   2267 opers/sec,  18576725 bytes/sec

ARM-neon
========
[ 1810.729100] testing speed of sha512
[ 1810.729130] test  0 (   16 byte blocks,   16 bytes per update,   1
updates): 330941 opers/sec,   5295066 bytes/sec
[ 1813.724958] test  1 (   64 byte blocks,   16 bytes per update,   4
updates): 277607 opers/sec,  17766890 bytes/sec
[ 1816.724958] test  2 (   64 byte blocks,   64 bytes per update,   1
updates): 330251 opers/sec,  21136085 bytes/sec
[ 1819.724956] test  3 (  256 byte blocks,   16 bytes per update,  16
updates):  89849 opers/sec,  23001429 bytes/sec
[ 1822.724961] test  4 (  256 byte blocks,   64 bytes per update,   4
updates): 113344 opers/sec,  29016149 bytes/sec
[ 1825.724963] test  5 (  256 byte blocks,  256 bytes per update,   1
updates): 127466 opers/sec,  32631381 bytes/sec
[ 1828.724960] test  6 ( 1024 byte blocks,   16 bytes per update,  64
updates):  27818 opers/sec,  28485632 bytes/sec
[ 

[PATCH] [v2] crypto: sha512: add ARM NEON implementation

2014-06-29 Thread Jussi Kivilinna
This patch adds ARM NEON assembly implementation of SHA-512 and SHA-384
algorithms.

tcrypt benchmark results on Cortex-A8, sha512-generic vs sha512-neon-asm:

block-size  bytes/update  old-vs-new
16          16            2.99x
64          16            2.67x
64          64            3.00x
256         16            2.64x
256         64            3.06x
256         256           3.33x
1024        16            2.53x
1024        256           3.39x
1024        1024          3.52x
2048        16            2.50x
2048        256           3.41x
2048        1024          3.54x
2048        2048          3.57x
4096        16            2.49x
4096        256           3.42x
4096        1024          3.56x
4096        4096          3.59x
8192        16            2.48x
8192        256           3.42x
8192        1024          3.56x
8192        4096          3.60x
8192        8192          3.60x

Changes in v2:
 - Use ENTRY/ENDPROC
 - Don't provide Thumb2 version

Signed-off-by: Jussi Kivilinna jussi.kivili...@iki.fi
---
 arch/arm/crypto/Makefile            |   2 ++
 arch/arm/crypto/sha512-armv7-neon.S | 455 +++
 arch/arm/crypto/sha512_neon_glue.c  | 305 +++
 crypto/Kconfig                      |  15 +
 4 files changed, 777 insertions(+)
 create mode 100644 arch/arm/crypto/sha512-armv7-neon.S
 create mode 100644 arch/arm/crypto/sha512_neon_glue.c

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 374956d..b48fa34 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -6,11 +6,13 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 
 aes-arm-y  := aes-armv4.o aes_glue.o
 aes-arm-bs-y   := aesbs-core.o aesbs-glue.o
 sha1-arm-y := sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y:= sha1-armv7-neon.o sha1_neon_glue.o
+sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/sha512-armv7-neon.S b/arch/arm/crypto/sha512-armv7-neon.S
new file mode 100644
index 0000000..fe99472
--- /dev/null
+++ b/arch/arm/crypto/sha512-armv7-neon.S
@@ -0,0 +1,455 @@
+/* sha512-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-512 transform
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna jussi.kivili...@iki.fi
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/linkage.h>
+
+
+.syntax unified
+.code   32
+.fpu neon
+
+.text
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RA d0
+#define RB d1
+#define RC d2
+#define RD d3
+#define RE d4
+#define RF d5
+#define RG d6
+#define RH d7
+
+#define RT0 d8
+#define RT1 d9
+#define RT2 d10
+#define RT3 d11
+#define RT4 d12
+#define RT5 d13
+#define RT6 d14
+#define RT7 d15
+
+#define RT01q q4
+#define RT23q q5
+#define RT45q q6
+#define RT67q q7
+
+#define RW0 d16
+#define RW1 d17
+#define RW2 d18
+#define RW3 d19
+#define RW4 d20
+#define RW5 d21
+#define RW6 d22
+#define RW7 d23
+#define RW8 d24
+#define RW9 d25
+#define RW10 d26
+#define RW11 d27
+#define RW12 d28
+#define RW13 d29
+#define RW14 d30
+#define RW15 d31
+
+#define RW01q q8
+#define RW23q q9
+#define RW45q q10
+#define RW67q q11
+#define RW89q q12
+#define RW1011q q13
+#define RW1213q q14
+#define RW1415q q15
+
+/***
+ * ARM assembly implementation of sha512 transform
+ ***/
+#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
+ rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
+   /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+   vshr.u64 RT2, re, #14; \
+   vshl.u64 RT3, re, #64 - 14; \
+   interleave_op(arg1); \
+   vshr.u64 RT4, re, #18; \
+   vshl.u64 RT5, re, #64 - 18; \
+   vld1.64 {RT0}, [RK]!; \
+   veor.64 RT23q, RT23q, RT45q; \
+   vshr.u64 RT4, re, #41; \
+   vshl.u64 RT5, re, #64 - 41; \
+   vadd.u64 RT0, RT0, rw0; \
+   veor.64 RT23q, RT23q, RT45q; \
+   vmov.64 RT7, re; \
+   veor.64 RT1, RT2, RT3; \
+   vbsl.64 RT7, rf, rg; \
+   \
+   vadd.u64 RT1, RT1, rh; \
+   vshr.u64 RT2,