v2: avoid expensive trap on unaligned LDM; reshuffle some insns
From fa19a36985b7554517e9122b4cd193cd1a9c4f0e Mon Sep 17 00:00:00 2001 From: "Yuriy M. Kaminskiy" <[email protected]> Date: Sun, 10 Mar 2019 11:08:46 +0300 Subject: [PATCH] Add fast constant-time ARM NEON ghash/gcm
Based on code from https://conradoplg.cryptoland.net/software/ecc-and-ae-for-arm-neon/ and https://hal.inria.fr/hal-01506572 Note: arm->neon is fast, neon->arm slow, so we delay bitreverse (performed in arm) as much as possible and keep ctx->x and ctx->key bitreversed. Only for little-endian. On Raspberry PI 3B+ (armv8-a/Cortex-A53 @1.4GHz): === bench-slopes-nettle === aes128 | nanosecs/byte mebibytes/sec cycles/byte Before: GCM auth | 28.43 ns/B 33.54 MiB/s 39.81 c/B After: (300% faster) GCM auth | 7.05 ns/B 135.3 MiB/s 9.87 c/B --- arm/fat/gcm-hash-2.asm | 37 ++++++++ arm/neon/gcm-hash.asm | 238 +++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 3 + fat-arm.c | 12 +++ gcm.c | 40 ++++++++- 5 files changed, 328 insertions(+), 2 deletions(-) create mode 100644 arm/fat/gcm-hash-2.asm create mode 100644 arm/neon/gcm-hash.asm diff --git a/arm/fat/gcm-hash-2.asm b/arm/fat/gcm-hash-2.asm new file mode 100644 index 00000000..4c9729f1 --- /dev/null +++ b/arm/fat/gcm-hash-2.asm @@ -0,0 +1,37 @@ +C arm/fat/gcm-hash-2.asm + + +ifelse(< + Copyright (C) 2015 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. 
If + not, see http://www.gnu.org/licenses/. +>) + +dnl PROLOGUE(_nettle_gcm_hash) picked up by configure +dnl PROLOGUE(_nettle_gcm_hash_convert) picked up by configure +define(<fat_transform>, <$1_neon>) +include_src(<arm/neon/gcm-hash.asm>) diff --git a/arm/neon/gcm-hash.asm b/arm/neon/gcm-hash.asm new file mode 100644 index 00000000..acdf2941 --- /dev/null +++ b/arm/neon/gcm-hash.asm @@ -0,0 +1,238 @@ +C arm/neon/gcm-hash.asm + +ifelse(< + Copyright (C) Danilo Câmara <[email protected]> + Copyright (C) Conrado Porto Lopes Gouvêa <[email protected]> + (gf(2**128) multiplication core) + Copyright (C) 2019 Yuriy Kaminskiy <[email protected]> + (nettle integration) + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>) + .file "gcm-hash.asm" + .arch armv7-a + .fpu neon + .arm + +.macro MUL64k3t4 rq rl rh ad bd k16 k32 k48 t0q t0l t0h t1q t1l t1h t2q t2l t2h t3q t3l t3h + + # a[76543210] * b[07654321] # a[76543210] * b[10765432] # a[76543210] * b[21076543] + # a[07654321] * b[76543210] # a[10765432] * b[76543210] # a[21076543] * b[76543210] # a[76543210] * b[32107654] # a[76543210] * b[76543210] + @A1 + vext.8 \t0l, \ad, \ad, $1 + @F = A1*B + vmull.p8 \t0q, \t0l, \bd + @B1 + vext.8 \rl, \bd, \bd, $1 + @E = A*B1 (7) + vmull.p8 \rq, \ad, \rl + @A2 + vext.8 \t1l, \ad, \ad, $2 + @H = A2*B + vmull.p8 \t1q, \t1l, \bd + @B2 + vext.8 \t3l, \bd, \bd, $2 + @G = A*B2 + vmull.p8 \t3q, \ad, \t3l + @A3 + vext.8 \t2l, \ad, \ad, $3 + @J = A3*B + vmull.p8 \t2q, \t2l, \bd + @L = E + F + veor \t0q, \t0q, \rq + @B3 + vext.8 \rl, \bd, \bd, $3 + @I = A*B3 + vmull.p8 \rq, \ad, \rl + @M = G + H + veor \t1q, \t1q, \t3q + @B4 + vext.8 \t3l, \bd, \bd, $4 + @K = A*B4 + vmull.p8 \t3q, \ad, \t3l + @t0 = (L) (P0 + P1) shl 8 + veor \t0l, \t0l, \t0h + vand \t0h, \t0h, \k48 + @t1 = (M) (P2 + P3) shl 16 + veor \t1l, \t1l, \t1h + vand \t1h, \t1h, \k32 + @N = I + J + veor \t2q, \t2q, \rq + veor \t0l, \t0l, \t0h + veor \t1l, \t1l, \t1h + @t2 = (N) (P4 + P5) shl 24 + veor \t2l, \t2l, \t2h + vand \t2h, \t2h, \k16 + @t3 = (K) (P6 + P7) shl 32 + veor \t3l, \t3l, \t3h + vmov.i64 \t3h, $0 + vext.8 \t0q, \t0q, \t0q, $15 + veor \t2l, \t2l, \t2h + vext.8 \t1q, \t1q, \t1q, $14 + vmull.p8 \rq, \ad, \bd + vext.8 \t2q, \t2q, \t2q, $13 + vext.8 \t3q, \t3q, \t3q, $12 + veor \t0q, \t0q, \t1q + veor \t2q, \t2q, \t3q + veor \rq, \rq, \t0q + veor \rq, \rq, \t2q + +.endm + +C r0 key (pre-bitreversed) +C r1 x (pre-bitreversed) +C r2 count +C r3 data +PROLOGUE(_nettle_gcm_hash) + .fnstart + push {r4,r5,r6,r7} + .save {r4,r5,r6,r7} + vpush.64 {d8,d9,d10,d11,d12,d13,d14,d15} + .vsave {d8,d9,d10,d11,d12,d13,d14,d15} +C q4 (d8 d9) = key + vld1.32 {d8,d9}, [r0:64] +C q0 (d0 d1) = x + vld1.32 {d0,d1}, [r1:64] + + vmov.i64 d14, 
#0x0000FFFFFFFFFFFF + vmov.i64 d13, #0x00000000FFFFFFFF + vmov.i64 d12, #0x000000000000FFFF + cmp r2, #15 + bls .Lleftover +.Lloop: + ldr r4, [r3] @ unaligned + ldr r5, [r3, #+4] @ unaligned + ldr r6, [r3, #+8] @ unaligned + ldr r7, [r3, #+12] @ unaligned + sub r2, #16 + add r3, #16 +.Lentry: + rbit r4, r4 + rbit r5, r5 + rbit r6, r6 + rbit r7, r7 + vmov d6, r4, r5 + vmov d7, r6, r7 + vrev32.i8 d6, d6 + vrev32.i8 d7, d7 + vmov q2, q4 + veor q3, q0 + + @vld1.32 {d4,d5}, [r1:64] + @vld1.32 {d6,d7}, [r2:64] + + MUL64k3t4 q0, d0, d1, d4, d6, d12, d13, d14, q12, d24, d25, q13, d26, d27, q14, d28, d29, q15, d30, d31 + MUL64k3t4 q1, d2, d3, d5, d7, d12, d13, d14, q12, d24, d25, q13, d26, d27, q14, d28, d29, q15, d30, d31 + veor d6, d7 + veor d7, d4, d5 + MUL64k3t4 q2, d4, d5, d6, d7, d12, d13, d14, q12, d24, d25, q13, d26, d27, q14, d28, d29, q15, d30, d31 + veor q2, q0 + veor q2, q1 + veor d1, d4 + veor d2, d5 + + @vst1.32 {d0-d3}, [r0:64] + @bx lr + + vmov.i8 d30, #135 + //[d3:d2|d1:d0] + //[d3:d2] * r(z) + //[d5:d4] = d2 * r(z) + //[d7:d6] = d3 * r(z) + vmull.p8 q2, d2, d30 + vmull.p8 q3, d3, d30 + vuzp.8 d4, d5 + vuzp.8 d6, d7 + + vswp d5, d6 + + veor q0, q2 + vshl.i64 q8, q3, #8 + vsri.64 d17, d6, #(64-8) + vshr.U64 d18, d7, #(64-8) + veor q0, q8 + + vmull.p8 q2, d18, d30 + veor d0, d4 + + @vst1.32 {d0,d1}, [r0:64] + @bx lr + + cmp r2, #15 + bhi .Lloop + +.Lleftover: + add r3, r2 + mov r4, #0 + rsb r2, r2, #15 + mov r5, #0 + mov r6, #0 + cmp r2, #14 + mov r7, #0 + addls pc, pc, r2, asl #3 +1: + b .Lexit + .rept 3 + ldrb ip, [r3, #-1]! + orr r7, ip, r7, lsl #8 + .endr + .irpc r,654 + .rept 4 + ldrb ip, [r3, #-1]! 
+ orr r\r, ip, r\r, lsl #8 + .endr + .endr +2: + .ifne 1b-2b+(4+8*15) + .error "incorrect switch table size" + .endif + mov r2,#0 + b .Lentry +.Lexit: + vpop.64 {d8,d9,d10,d11,d12,d13,d14,d15} + vst1.32 {d0,d1}, [r1:64] + pop {r4,r5,r6,r7} + bx lr + .fnend + +EPILOGUE(_nettle_gcm_hash) + +PROLOGUE(_nettle_gcm_hash_convert) + .fnstart + ldm r1, {r1, r2, r3, ip} + rbit r1, r1 + rbit r2, r2 + rbit r3, r3 + rbit ip, ip + rev r1, r1 + rev r2, r2 + rev r3, r3 + rev ip, ip + stm r0, {r1, r2, r3, ip} + bx lr + .fnend +EPILOGUE(_nettle_gcm_hash_convert) diff --git a/configure.ac b/configure.ac index 3f409fa4..f12dbf34 100644 --- a/configure.ac +++ b/configure.ac @@ -473,6 +473,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-core-internal-2.asm \ + gcm-hash.asm gcm-hash-2.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ sha3-permute-2.asm sha512-compress-2.asm \ umac-nh-n-2.asm umac-nh-2.asm" @@ -587,6 +588,8 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_521_modp #undef HAVE_NATIVE_ecc_521_redc #undef HAVE_NATIVE_gcm_hash8 +#undef HAVE_NATIVE_gcm_hash +#undef HAVE_NATIVE_gcm_hash_convert #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_sha1_compress #undef HAVE_NATIVE_sha256_compress diff --git a/fat-arm.c b/fat-arm.c index 48feb5d4..6e4c8622 100644 --- a/fat-arm.c +++ b/fat-arm.c @@ -175,6 +175,14 @@ DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c); DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, neon); +typedef void gcm_hash_func(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +typedef void gcm_hash_convert_func (union nettle_block16 *dst, const void *); +extern gcm_hash_func _nettle_gcm_hash_c, _nettle_gcm_hash_neon; +extern gcm_hash_convert_func _nettle_gcm_hash_convert_c, 
_nettle_gcm_hash_convert_neon; +extern gcm_hash_func *_nettle_gcm_hash_vec; +extern gcm_hash_convert_func *_nettle_gcm_hash_convert_vec; + static void CONSTRUCTOR fat_init (void) { @@ -217,6 +225,10 @@ fat_init (void) _nettle_umac_nh_vec = _nettle_umac_nh_neon; _nettle_umac_nh_n_vec = _nettle_umac_nh_n_neon; _nettle_chacha_core_vec = _nettle_chacha_core_neon; +#if !WORDS_BIGENDIAN && defined(HAVE_NATIVE_gcm_hash) && defined(HAVE_NATIVE_gcm_hash_convert) + _nettle_gcm_hash_vec = _nettle_gcm_hash_neon; + _nettle_gcm_hash_convert_vec = _nettle_gcm_hash_convert_neon; +#endif } else { diff --git a/gcm.c b/gcm.c index 14a6181b..a2467e39 100644 --- a/gcm.c +++ b/gcm.c @@ -331,6 +331,18 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4) +#if !WORDS_BIGENDIAN && defined(HAVE_NATIVE_gcm_hash) && defined(HAVE_NATIVE_gcm_hash_convert) +typedef void gcm_hash_func(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +gcm_hash_func _nettle_gcm_hash_c, _nettle_gcm_hash; +gcm_hash_func *_nettle_gcm_hash_vec = _nettle_gcm_hash_c; +typedef void gcm_hash_convert_func (union nettle_block16 *dst, const union nettle_block16*); +#define gcm_hash (*_nettle_gcm_hash_vec) +gcm_hash_convert_func _nettle_gcm_hash_convert_c, _nettle_gcm_hash_convert; +gcm_hash_convert_func *_nettle_gcm_hash_convert_vec = _nettle_gcm_hash_convert_c; +#define gcm_hash_convert (*_nettle_gcm_hash_convert_vec) +#endif + /* Initialization of GCM. 
* @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -347,7 +359,14 @@ gcm_set_key(struct gcm_key *key, /* H */ memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b); - + +#if !WORDS_BIGENDIAN && defined(HAVE_NATIVE_gcm_hash) && defined(HAVE_NATIVE_gcm_hash_convert) + if (gcm_hash_convert != _nettle_gcm_hash_convert_c) { + key->h[0] = key->h[i]; + return; + } +#endif + #if GCM_TABLE_BITS /* Algorithm 3 from the gcm paper. First do powers of two, then do the rest by adding. */ @@ -362,10 +381,26 @@ gcm_set_key(struct gcm_key *key, #endif } +#ifndef gcm_hash_convert +static void +gcm_hash_convert(union nettle_block16 *x, const union nettle_block16 *y) +#else +void +_nettle_gcm_hash_convert_c(union nettle_block16 *x, const union nettle_block16 *y) +#endif +{ + if (x != y) memcpy(x, y, sizeof(*x)); +} + #ifndef gcm_hash static void gcm_hash(const struct gcm_key *key, union nettle_block16 *x, size_t length, const uint8_t *data) +#else +void +_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data) +#endif { for (; length >= GCM_BLOCK_SIZE; length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE) @@ -379,7 +414,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x, gcm_gf_mul (x, key->h); } } -#endif /* !gcm_hash */ static void gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x, @@ -414,6 +448,7 @@ gcm_set_iv(struct gcm_ctx *ctx, const struct gcm_key *key, memset(ctx->iv.b, 0, GCM_BLOCK_SIZE); gcm_hash(key, &ctx->iv, length, iv); gcm_hash_sizes(key, &ctx->iv, 0, length); + gcm_hash_convert(&ctx->iv, &ctx->iv); } memcpy (ctx->ctr.b, ctx->iv.b, GCM_BLOCK_SIZE); @@ -491,6 +526,7 @@ gcm_digest(struct gcm_ctx *ctx, const struct gcm_key *key, gcm_hash_sizes(key, &ctx->x, ctx->auth_size, ctx->data_size); f (cipher, GCM_BLOCK_SIZE, buffer, ctx->iv.b); + gcm_hash_convert (&ctx->x, &ctx->x); memxor3 (digest, ctx->x.b, buffer, length); return; 
-- 2.11.0
_______________________________________________ nettle-bugs mailing list [email protected] http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
