v2: avoid expensive trap on unaligned LDM, reshuffled some insns
From fa19a36985b7554517e9122b4cd193cd1a9c4f0e Mon Sep 17 00:00:00 2001
From: "Yuriy M. Kaminskiy" <[email protected]>
Date: Sun, 10 Mar 2019 11:08:46 +0300
Subject: [PATCH] Add fast constant-time ARM NEON ghash/gcm

Based on code from
https://conradoplg.cryptoland.net/software/ecc-and-ae-for-arm-neon/
and
https://hal.inria.fr/hal-01506572
Note: arm->neon register transfers are fast, but neon->arm transfers are
slow, so we delay the bitreverse step (performed on the arm side) as much
as possible and keep ctx->x and ctx->key stored bitreversed.

Only for little-endian.

On Raspberry PI 3B+ (armv8-a/Cortex-A53 @1.4GHz):
=== bench-slopes-nettle ===
 aes128         |  nanosecs/byte   mebibytes/sec   cycles/byte
Before:
       GCM auth |     28.43 ns/B     33.54 MiB/s     39.81 c/B
After: (300% faster)
       GCM auth |      7.05 ns/B     135.3 MiB/s      9.87 c/B
---
 arm/fat/gcm-hash-2.asm |  37 ++++++++
 arm/neon/gcm-hash.asm  | 238 +++++++++++++++++++++++++++++++++++++++++++++++++
 configure.ac           |   3 +
 fat-arm.c              |  12 +++
 gcm.c                  |  40 ++++++++-
 5 files changed, 328 insertions(+), 2 deletions(-)
 create mode 100644 arm/fat/gcm-hash-2.asm
 create mode 100644 arm/neon/gcm-hash.asm

diff --git a/arm/fat/gcm-hash-2.asm b/arm/fat/gcm-hash-2.asm
new file mode 100644
index 00000000..4c9729f1
--- /dev/null
+++ b/arm/fat/gcm-hash-2.asm
@@ -0,0 +1,37 @@
+C arm/fat/gcm-hash-2.asm
+
+
+ifelse(<
+   Copyright (C) 2015 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_gcm_hash) picked up by configure
+dnl PROLOGUE(_nettle_gcm_hash_convert) picked up by configure
+define(<fat_transform>, <$1_neon>)
+include_src(<arm/neon/gcm-hash.asm>)
diff --git a/arm/neon/gcm-hash.asm b/arm/neon/gcm-hash.asm
new file mode 100644
index 00000000..acdf2941
--- /dev/null
+++ b/arm/neon/gcm-hash.asm
@@ -0,0 +1,238 @@
+C arm/neon/gcm-hash.asm
+
+ifelse(<
+   Copyright (C) Danilo Câmara <[email protected]>
+   Copyright (C) Conrado Porto Lopes Gouvêa <[email protected]>
+   (gf(2**128) multiplication core)
+   Copyright (C) 2019 Yuriy Kaminskiy <[email protected]>
+   (nettle integration)
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+	.file "gcm-hash.asm"
+	.arch	armv7-a
+	.fpu	neon
+	.arm
+
+.macro MUL64k3t4 rq rl rh ad bd k16 k32 k48 t0q t0l t0h t1q t1l t1h t2q t2l t2h t3q t3l t3h
+
+    # a[76543210] * b[07654321]             # a[76543210] * b[10765432]             # a[76543210] * b[21076543]
+    # a[07654321] * b[76543210]             # a[10765432] * b[76543210]             # a[21076543] * b[76543210]             # a[76543210] * b[32107654]             # a[76543210] * b[76543210]
+    @A1
+    vext.8  \t0l, \ad, \ad, $1
+    @F = A1*B
+    vmull.p8 \t0q, \t0l, \bd
+    @B1
+    vext.8  \rl, \bd, \bd, $1
+    @E = A*B1 (7)
+    vmull.p8 \rq, \ad, \rl
+                                            @A2
+                                            vext.8  \t1l, \ad, \ad, $2
+                                            @H = A2*B
+                                            vmull.p8 \t1q, \t1l, \bd
+                                            @B2
+                                            vext.8  \t3l, \bd, \bd, $2
+                                            @G = A*B2
+                                            vmull.p8 \t3q, \ad, \t3l
+                                                                                    @A3
+                                                                                    vext.8  \t2l, \ad, \ad, $3
+                                                                                    @J = A3*B
+                                                                                    vmull.p8 \t2q, \t2l, \bd
+    @L = E + F
+    veor    \t0q, \t0q, \rq
+                                                                                    @B3
+                                                                                    vext.8  \rl, \bd, \bd, $3
+                                                                                    @I = A*B3
+                                                                                    vmull.p8 \rq, \ad, \rl
+                                            @M = G + H
+                                            veor    \t1q, \t1q, \t3q
+                                                                                                                            @B4
+                                                                                                                            vext.8  \t3l, \bd, \bd, $4
+                                                                                                                            @K = A*B4
+                                                                                                                            vmull.p8 \t3q, \ad, \t3l
+    @t0 = (L) (P0 + P1) shl 8
+    veor    \t0l, \t0l, \t0h
+    vand    \t0h, \t0h, \k48
+                                            @t1 = (M) (P2 + P3) shl 16
+                                            veor    \t1l, \t1l, \t1h
+                                            vand    \t1h, \t1h, \k32
+                                                                                    @N = I + J
+                                                                                    veor    \t2q, \t2q, \rq
+    veor    \t0l, \t0l, \t0h
+                                            veor    \t1l, \t1l, \t1h
+                                                                                    @t2 = (N) (P4 + P5) shl 24
+                                                                                    veor    \t2l, \t2l, \t2h
+                                                                                    vand    \t2h, \t2h, \k16
+                                                                                                                            @t3 = (K) (P6 + P7) shl 32
+                                                                                                                            veor    \t3l, \t3l, \t3h
+                                                                                                                            vmov.i64 \t3h, $0
+    vext.8  \t0q, \t0q, \t0q, $15
+                                                                                    veor    \t2l, \t2l, \t2h
+                                            vext.8  \t1q, \t1q, \t1q, $14
+                                                                                                                                                                    vmull.p8 \rq, \ad, \bd
+                                                                                    vext.8  \t2q, \t2q, \t2q, $13
+                                                                                                                            vext.8  \t3q, \t3q, \t3q, $12
+    veor    \t0q, \t0q, \t1q
+    veor    \t2q, \t2q, \t3q
+    veor    \rq, \rq, \t0q
+    veor    \rq, \rq, \t2q
+
+.endm
+
+C r0 key (pre-bitreversed)
+C r1 x (pre-bitreversed)
+C r2 count
+C r3 data
+PROLOGUE(_nettle_gcm_hash)
+	.fnstart
+	push {r4,r5,r6,r7}
+	.save {r4,r5,r6,r7}
+	vpush.64 {d8,d9,d10,d11,d12,d13,d14,d15}
+	.vsave {d8,d9,d10,d11,d12,d13,d14,d15}
+C q4 (d8 d9) = key
+	vld1.32 {d8,d9}, [r0:64]
+C q0 (d0 d1) = x
+	vld1.32 {d0,d1}, [r1:64]
+
+    vmov.i64 d14, #0x0000FFFFFFFFFFFF
+    vmov.i64 d13, #0x00000000FFFFFFFF
+    vmov.i64 d12, #0x000000000000FFFF
+	cmp	r2, #15
+	bls	.Lleftover
+.Lloop:
+	ldr r4, [r3] @ unaligned
+	ldr r5, [r3, #+4] @ unaligned
+	ldr r6, [r3, #+8] @ unaligned
+	ldr r7, [r3, #+12] @ unaligned
+	sub	r2, #16
+	add	r3, #16
+.Lentry:
+	rbit	r4, r4
+	rbit	r5, r5
+	 rbit	r6, r6
+	 rbit	r7, r7
+	vmov	d6, r4, r5
+	 vmov	d7, r6, r7
+	vrev32.i8 d6, d6
+	 vrev32.i8 d7, d7
+	vmov	q2, q4
+	 veor	q3, q0
+
+    @vld1.32 {d4,d5}, [r1:64]
+    @vld1.32 {d6,d7}, [r2:64]
+
+    MUL64k3t4 q0, d0, d1, d4, d6, d12, d13, d14, q12, d24, d25, q13, d26, d27, q14, d28, d29, q15, d30, d31
+    MUL64k3t4 q1, d2, d3, d5, d7, d12, d13, d14, q12, d24, d25, q13, d26, d27, q14, d28, d29, q15, d30, d31
+    veor d6, d7
+    veor d7, d4, d5
+    MUL64k3t4 q2, d4, d5, d6, d7, d12, d13, d14, q12, d24, d25, q13, d26, d27, q14, d28, d29, q15, d30, d31
+    veor q2, q0
+    veor q2, q1
+    veor d1, d4
+    veor d2, d5
+
+    @vst1.32 {d0-d3}, [r0:64]
+    @bx lr
+
+    vmov.i8 d30, #135
+    //[d3:d2|d1:d0]
+    //[d3:d2] * r(z)
+    //[d5:d4] = d2 * r(z)
+    //[d7:d6] = d3 * r(z)
+    vmull.p8 q2, d2, d30
+    vmull.p8 q3, d3, d30
+    vuzp.8 d4, d5
+    vuzp.8 d6, d7
+
+    vswp d5, d6
+
+    veor q0, q2
+    vshl.i64 q8, q3, #8
+    vsri.64 d17, d6, #(64-8)
+    vshr.U64 d18, d7, #(64-8)
+    veor q0, q8
+
+    vmull.p8 q2, d18, d30
+    veor d0, d4
+
+    @vst1.32 {d0,d1}, [r0:64]
+    @bx lr
+
+	cmp	r2, #15
+	bhi	.Lloop
+
+.Lleftover:
+	add	r3, r2
+	 mov	r4, #0
+	rsb	r2, r2, #15
+	 mov	r5, #0
+	 mov	r6, #0
+	cmp	r2, #14
+	 mov	r7, #0
+	addls	pc, pc, r2, asl #3
+1:
+	b .Lexit
+	.rept 3
+	ldrb	ip, [r3, #-1]!
+	orr	r7, ip, r7, lsl #8
+	.endr
+	.irpc r,654
+	.rept 4
+	ldrb	ip, [r3, #-1]!
+	orr	r\r, ip, r\r, lsl #8
+	.endr
+	.endr
+2:
+	.ifne 1b-2b+(4+8*15)
+	.error "incorrect switch table size"
+	.endif
+	mov	r2,#0
+	b .Lentry
+.Lexit:
+	vpop.64	{d8,d9,d10,d11,d12,d13,d14,d15}
+	vst1.32	{d0,d1}, [r1:64]
+	pop	{r4,r5,r6,r7}
+	bx lr
+	.fnend
+
+EPILOGUE(_nettle_gcm_hash)
+
+PROLOGUE(_nettle_gcm_hash_convert)
+	.fnstart
+    ldm r1, {r1, r2, r3, ip}
+    rbit r1, r1
+    rbit r2, r2
+    rbit r3, r3
+    rbit ip, ip
+    rev r1, r1
+    rev r2, r2
+    rev r3, r3
+    rev ip, ip
+    stm r0, {r1, r2, r3, ip}
+    bx lr
+	.fnend
+EPILOGUE(_nettle_gcm_hash_convert)
diff --git a/configure.ac b/configure.ac
index 3f409fa4..f12dbf34 100644
--- a/configure.ac
+++ b/configure.ac
@@ -473,6 +473,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
   chacha-core-internal-2.asm \
+  gcm-hash.asm gcm-hash-2.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
@@ -587,6 +588,8 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_521_modp
 #undef HAVE_NATIVE_ecc_521_redc
 #undef HAVE_NATIVE_gcm_hash8
+#undef HAVE_NATIVE_gcm_hash
+#undef HAVE_NATIVE_gcm_hash_convert
 #undef HAVE_NATIVE_salsa20_core
 #undef HAVE_NATIVE_sha1_compress
 #undef HAVE_NATIVE_sha256_compress
diff --git a/fat-arm.c b/fat-arm.c
index 48feb5d4..6e4c8622 100644
--- a/fat-arm.c
+++ b/fat-arm.c
@@ -175,6 +175,14 @@ DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
 DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
 DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, neon);
 
+typedef void gcm_hash_func(const struct gcm_key *key, union nettle_block16 *x,
+	 size_t length, const uint8_t *data);
+typedef void gcm_hash_convert_func (union nettle_block16 *dst, const void *);
+extern gcm_hash_func _nettle_gcm_hash_c, _nettle_gcm_hash_neon;
+extern gcm_hash_convert_func _nettle_gcm_hash_convert_c, _nettle_gcm_hash_convert_neon;
+extern gcm_hash_func *_nettle_gcm_hash_vec;
+extern gcm_hash_convert_func *_nettle_gcm_hash_convert_vec;
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -217,6 +225,10 @@ fat_init (void)
       _nettle_umac_nh_vec = _nettle_umac_nh_neon;
       _nettle_umac_nh_n_vec = _nettle_umac_nh_n_neon;
       _nettle_chacha_core_vec = _nettle_chacha_core_neon;
+#if !WORDS_BIGENDIAN && defined(HAVE_NATIVE_gcm_hash) && defined(HAVE_NATIVE_gcm_hash_convert)
+      _nettle_gcm_hash_vec = _nettle_gcm_hash_neon;
+      _nettle_gcm_hash_convert_vec = _nettle_gcm_hash_convert_neon;
+#endif
     }
   else
     {
diff --git a/gcm.c b/gcm.c
index 14a6181b..a2467e39 100644
--- a/gcm.c
+++ b/gcm.c
@@ -331,6 +331,18 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
 /* Increment the rightmost 32 bits. */
 #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
 
+#if !WORDS_BIGENDIAN && defined(HAVE_NATIVE_gcm_hash) && defined(HAVE_NATIVE_gcm_hash_convert)
+typedef void gcm_hash_func(const struct gcm_key *key, union nettle_block16 *x,
+	 size_t length, const uint8_t *data);
+gcm_hash_func _nettle_gcm_hash_c, _nettle_gcm_hash;
+gcm_hash_func *_nettle_gcm_hash_vec = _nettle_gcm_hash_c;
+typedef void gcm_hash_convert_func (union nettle_block16 *dst, const union nettle_block16*);
+#define gcm_hash (*_nettle_gcm_hash_vec)
+gcm_hash_convert_func _nettle_gcm_hash_convert_c, _nettle_gcm_hash_convert;
+gcm_hash_convert_func *_nettle_gcm_hash_convert_vec = _nettle_gcm_hash_convert_c;
+#define gcm_hash_convert (*_nettle_gcm_hash_convert_vec)
+#endif
+
 /* Initialization of GCM.
  * @ctx: The context of GCM
  * @cipher: The context of the underlying block cipher
@@ -347,7 +359,14 @@ gcm_set_key(struct gcm_key *key,
   /* H */  
   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-  
+
+#if !WORDS_BIGENDIAN && defined(HAVE_NATIVE_gcm_hash) && defined(HAVE_NATIVE_gcm_hash_convert)
+ if (gcm_hash_convert != _nettle_gcm_hash_convert_c) {
+   key->h[0] = key->h[i];
+   return;
+ }
+#endif
+
 #if GCM_TABLE_BITS
   /* Algorithm 3 from the gcm paper. First do powers of two, then do
      the rest by adding. */
@@ -362,10 +381,26 @@ gcm_set_key(struct gcm_key *key,
 #endif
 }
 
+#ifndef gcm_hash_convert
+static void
+gcm_hash_convert(union nettle_block16 *x, const union nettle_block16 *y)
+#else
+void
+_nettle_gcm_hash_convert_c(union nettle_block16 *x, const union nettle_block16 *y)
+#endif
+{
+  if (x != y) memcpy(x, y, sizeof(*x));
+}
+
 #ifndef gcm_hash
 static void
 gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
 	 size_t length, const uint8_t *data)
+#else
+void
+_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x,
+	 size_t length, const uint8_t *data)
+#endif
 {
   for (; length >= GCM_BLOCK_SIZE;
        length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
@@ -379,7 +414,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
       gcm_gf_mul (x, key->h);
     }
 }
-#endif /* !gcm_hash */
 
 static void
 gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
@@ -414,6 +448,7 @@ gcm_set_iv(struct gcm_ctx *ctx, const struct gcm_key *key,
       memset(ctx->iv.b, 0, GCM_BLOCK_SIZE);
       gcm_hash(key, &ctx->iv, length, iv);
       gcm_hash_sizes(key, &ctx->iv, 0, length);
+      gcm_hash_convert(&ctx->iv, &ctx->iv);
     }
 
   memcpy (ctx->ctr.b, ctx->iv.b, GCM_BLOCK_SIZE);
@@ -491,6 +526,7 @@ gcm_digest(struct gcm_ctx *ctx, const struct gcm_key *key,
   gcm_hash_sizes(key, &ctx->x, ctx->auth_size, ctx->data_size);
 
   f (cipher, GCM_BLOCK_SIZE, buffer, ctx->iv.b);
+  gcm_hash_convert (&ctx->x, &ctx->x);
   memxor3 (digest, ctx->x.b, buffer, length);
 
   return;
-- 
2.11.0

_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Reply via email to