* cipher/Makefile.am: Add 'cipher-gcm-aarch64-simd.c'.
* cipher/cipher-gcm-aarch64-simd.c: New.
* cipher/cipher-gcm.c [GCM_USE_AARCH64]: Add function prototypes for
AArch64/SIMD implementation.
(setupM) [GCM_USE_AARCH64]: Add setup for AArch64/SIMD implementation.
* cipher/cipher-internal.h (GCM_USE_AARCH64): New.
* configure.ac: Add 'cipher-gcm-aarch64-simd.c'.
--
Patch adds a GHASH/GCM intrinsics implementation for AArch64. This is
for CPUs without support for the Crypto Extensions instruction set.

Benchmark on Cortex-A53 (1152 MHz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES       |     12.22 ns/B     78.07 MiB/s     14.07 c/B

After:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES       |      7.38 ns/B     129.2 MiB/s      8.50 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivili...@iki.fi>
---
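For reviewers without the referenced papers at hand, the operation being
accelerated is the standard GHASH recurrence over GF(2^128); this merely
restates the GCM definition, nothing here is specific to the patch:

    X_0 = 0, \quad X_i = (X_{i-1} \oplus B_i) \cdot H \bmod g(x), \quad
    g(x) = x^{128} + x^7 + x^2 + x + 1

where H is the hash subkey and the B_i are the 16-byte input blocks.
CPUs without the Crypto Extensions lack the 64x64->128 carry-less
multiply (PMULL), so emulate_vmull_p64() below builds it out of eight
vmull.p8 (8x8->16 polynomial) multiplies following the Câmara et al.
construction, and pmul_128x128() combines three such 64-bit multiplies
Karatsuba-style into the full 128x128 product.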
 cipher/Makefile.am               |   9 +-
 cipher/cipher-gcm-aarch64-simd.c | 320 +++++++++++++++++++++++++++++++
 cipher/cipher-gcm.c              |  14 ++
 cipher/cipher-internal.h         |   6 +
 configure.ac                     |   1 +
 5 files changed, 349 insertions(+), 1 deletion(-)
 create mode 100644 cipher/cipher-gcm-aarch64-simd.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 2528bc39..633c53ed 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -89,7 +89,8 @@ EXTRA_libcipher_la_SOURCES = \
 	chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
 	chacha20-ppc.c chacha20-s390x.S \
 	chacha20-p10le-8x.s \
-	cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
+	cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \
+	cipher-gcm-aarch64-simd.c cipher-gcm-armv7-neon.S \
 	cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
 	crc.c crc-intel-pclmul.c crc-armv8-ce.c \
 	crc-armv8-aarch64-ce.S \
@@ -325,6 +326,12 @@ camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
 camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
 	`echo $(LTCOMPILE) $(aarch64_crypto_cflags) -c $< | $(instrumentation_munging) `
 
+cipher-gcm-aarch64-simd.o: $(srcdir)/cipher-gcm-aarch64-simd.c Makefile
+	`echo $(COMPILE) $(aarch64_simd_cflags) -c $< | $(instrumentation_munging) `
+
+cipher-gcm-aarch64-simd.lo: $(srcdir)/cipher-gcm-aarch64-simd.c Makefile
+	`echo $(LTCOMPILE) $(aarch64_simd_cflags) -c $< | $(instrumentation_munging) `
+
 rijndael-vp-aarch64.o: $(srcdir)/rijndael-vp-aarch64.c Makefile
 	`echo $(COMPILE) $(aarch64_simd_cflags) -c $< | $(instrumentation_munging) `
 
diff --git a/cipher/cipher-gcm-aarch64-simd.c b/cipher/cipher-gcm-aarch64-simd.c
new file mode 100644
index 00000000..ecb55a9f
--- /dev/null
+++ b/cipher/cipher-gcm-aarch64-simd.c
@@ -0,0 +1,320 @@
+/* cipher-gcm-aarch64-simd.c - ARM/NEON accelerated GHASH
+ * Copyright (C) 2019-2024 Jussi Kivilinna <jussi.kivili...@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+#ifdef GCM_USE_AARCH64
+
+#include "simd-common-aarch64.h"
+#include <arm_neon.h>
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+static ASM_FUNC_ATTR_INLINE uint64x2_t
+byteswap_u64x2(uint64x2_t vec)
+{
+  vec = (uint64x2_t)vrev64q_u8((uint8x16_t)vec);
+  vec = (uint64x2_t)vextq_u8((uint8x16_t)vec, (uint8x16_t)vec, 8);
+  return vec;
+}
+
+static ASM_FUNC_ATTR_INLINE uint64x2_t
+veor_u64x2(uint64x2_t va, uint64x2_t vb)
+{
+  return (uint64x2_t)veorq_u8((uint8x16_t)va, (uint8x16_t)vb);
+}
+
+static ASM_FUNC_ATTR_INLINE uint64x1_t
+veor_u64x1(uint64x1_t va, uint64x1_t vb)
+{
+  return (uint64x1_t)veor_u8((uint8x8_t)va, (uint8x8_t)vb);
+}
+
+static ASM_FUNC_ATTR_INLINE uint64x1_t
+vand_u64x1(uint64x1_t va, uint64x1_t vb)
+{
+  return (uint64x1_t)vand_u8((uint8x8_t)va, (uint8x8_t)vb);
+}
+
+static ASM_FUNC_ATTR_INLINE uint64x1_t
+vorr_u64x1(uint64x1_t va, uint64x1_t vb)
+{
+  return (uint64x1_t)vorr_u8((uint8x8_t)va, (uint8x8_t)vb);
+}
+
+/* 64x64=>128 carry-less multiplication using vmull.p8 instruction.
+ *
+ * From "Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R. Fast Software
+ * Polynomial Multiplication on ARM Processors using the NEON Engine. The
+ * Second International Workshop on Modern Cryptography and Security
+ * Engineering — MoCrySEn, 2013". */
+static ASM_FUNC_ATTR_INLINE uint64x2_t
+emulate_vmull_p64(uint64x1_t ad, uint64x1_t bd)
+{
+  static const uint64x1_t k0 = { 0 };
+  static const uint64x1_t k16 = { U64_C(0xffff) };
+  static const uint64x1_t k32 = { U64_C(0xffffffff) };
+  static const uint64x1_t k48 = { U64_C(0xffffffffffff) };
+  uint64x1_t rl;
+  uint64x2_t rq;
+  uint64x1_t t0l;
+  uint64x1_t t0h;
+  uint64x2_t t0q;
+  uint64x1_t t1l;
+  uint64x1_t t1h;
+  uint64x2_t t1q;
+  uint64x1_t t2l;
+  uint64x1_t t2h;
+  uint64x2_t t2q;
+  uint64x1_t t3l;
+  uint64x1_t t3h;
+  uint64x2_t t3q;
+
+  t0l = (uint64x1_t)vext_u8((uint8x8_t)ad, (uint8x8_t)ad, 1);
+  t0q = (uint64x2_t)vmull_p8((poly8x8_t)t0l, (poly8x8_t)bd);
+
+  rl = (uint64x1_t)vext_u8((uint8x8_t)bd, (uint8x8_t)bd, 1);
+  rq = (uint64x2_t)vmull_p8((poly8x8_t)ad, (poly8x8_t)rl);
+
+  t1l = (uint64x1_t)vext_u8((uint8x8_t)ad, (uint8x8_t)ad, 2);
+  t1q = (uint64x2_t)vmull_p8((poly8x8_t)t1l, (poly8x8_t)bd);
+
+  t3l = (uint64x1_t)vext_u8((uint8x8_t)bd, (uint8x8_t)bd, 2);
+  t3q = (uint64x2_t)vmull_p8((poly8x8_t)ad, (poly8x8_t)t3l);
+
+  t2l = (uint64x1_t)vext_u8((uint8x8_t)ad, (uint8x8_t)ad, 3);
+  t2q = (uint64x2_t)vmull_p8((poly8x8_t)t2l, (poly8x8_t)bd);
+
+  t0q = veor_u64x2(t0q, rq);
+  t0l = vget_low_u64(t0q);
+  t0h = vget_high_u64(t0q);
+
+  rl = (uint64x1_t)vext_u8((uint8x8_t)bd, (uint8x8_t)bd, 3);
+  rq = (uint64x2_t)vmull_p8((poly8x8_t)ad, (poly8x8_t)rl);
+
+  t1q = veor_u64x2(t1q, t3q);
+  t1l = vget_low_u64(t1q);
+  t1h = vget_high_u64(t1q);
+
+  t3l = (uint64x1_t)vext_u8((uint8x8_t)bd, (uint8x8_t)bd, 4);
+  t3q = (uint64x2_t)vmull_p8((poly8x8_t)ad, (poly8x8_t)t3l);
+  t3l = vget_low_u64(t3q);
+  t3h = vget_high_u64(t3q);
+
+  t0l = veor_u64x1(t0l, t0h);
+  t0h = vand_u64x1(t0h, k48);
+  t1l = veor_u64x1(t1l, t1h);
+  t1h = vand_u64x1(t1h, k32);
+  t2q = veor_u64x2(t2q, rq);
+  t2l = vget_low_u64(t2q);
+  t2h = vget_high_u64(t2q);
+  t0l = veor_u64x1(t0l, t0h);
+  t1l = veor_u64x1(t1l, t1h);
+  t2l = veor_u64x1(t2l, t2h);
+  t2h = vand_u64x1(t2h, k16);
+  t3l = veor_u64x1(t3l, t3h);
+  t3h = k0;
+  t0q = vcombine_u64(t0l, t0h);
+  t0q = (uint64x2_t)vextq_u8((uint8x16_t)t0q, (uint8x16_t)t0q, 15);
+  t2l = veor_u64x1(t2l, t2h);
+  t1q = vcombine_u64(t1l, t1h);
+  t1q = (uint64x2_t)vextq_u8((uint8x16_t)t1q, (uint8x16_t)t1q, 14);
+  rq = (uint64x2_t)vmull_p8((poly8x8_t)ad, (poly8x8_t)bd);
+  t2q = vcombine_u64(t2l, t2h);
+  t2q = (uint64x2_t)vextq_u8((uint8x16_t)t2q, (uint8x16_t)t2q, 13);
+  t3q = vcombine_u64(t3l, t3h);
+  t3q = (uint64x2_t)vextq_u8((uint8x16_t)t3q, (uint8x16_t)t3q, 12);
+  t0q = veor_u64x2(t0q, t1q);
+  t2q = veor_u64x2(t2q, t3q);
+  rq = veor_u64x2(rq, t0q);
+  rq = veor_u64x2(rq, t2q);
+  return rq;
+}
+
+/* GHASH functions.
+ *
+ * See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+static ASM_FUNC_ATTR_INLINE uint64x2x2_t
+pmul_128x128(uint64x2_t a, uint64x2_t b)
+{
+  uint64x1_t a_l = vget_low_u64(a);
+  uint64x1_t a_h = vget_high_u64(a);
+  uint64x1_t b_l = vget_low_u64(b);
+  uint64x1_t b_h = vget_high_u64(b);
+  uint64x1_t t1_h = veor_u64x1(b_l, b_h);
+  uint64x1_t t1_l = veor_u64x1(a_l, a_h);
+  uint64x2_t r0 = emulate_vmull_p64(a_l, b_l);
+  uint64x2_t r1 = emulate_vmull_p64(a_h, b_h);
+  uint64x2_t t2 = emulate_vmull_p64(t1_h, t1_l);
+  uint64x1_t t2_l, t2_h;
+  uint64x1_t r0_l, r0_h;
+  uint64x1_t r1_l, r1_h;
+
+  t2 = veor_u64x2(t2, r0);
+  t2 = veor_u64x2(t2, r1);
+
+  r0_l = vget_low_u64(r0);
+  r0_h = vget_high_u64(r0);
+  r1_l = vget_low_u64(r1);
+  r1_h = vget_high_u64(r1);
+  t2_l = vget_low_u64(t2);
+  t2_h = vget_high_u64(t2);
+
+  r0_h = veor_u64x1(r0_h, t2_l);
+  r1_l = veor_u64x1(r1_l, t2_h);
+
+  r0 = vcombine_u64(r0_l, r0_h);
+  r1 = vcombine_u64(r1_l, r1_h);
+
+  return (const uint64x2x2_t){ .val = { r0, r1 } };
+}
+
+/* Reduction using Xor and Shift.
+ *
+ * See "Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication
+ * Instruction and its Usage for Computing the GCM Mode" for details.
+ */
+static ASM_FUNC_ATTR_INLINE uint64x2_t
+reduction(uint64x2x2_t r0r1)
+{
+  static const uint64x2_t k0 = { U64_C(0), U64_C(0) };
+  uint64x2_t r0 = r0r1.val[0];
+  uint64x2_t r1 = r0r1.val[1];
+  uint64x2_t t0q;
+  uint64x2_t t1q;
+  uint64x2_t t2q;
+  uint64x2_t t;
+
+  t0q = (uint64x2_t)vshlq_n_u32((uint32x4_t)r0, 31);
+  t1q = (uint64x2_t)vshlq_n_u32((uint32x4_t)r0, 30);
+  t2q = (uint64x2_t)vshlq_n_u32((uint32x4_t)r0, 25);
+  t0q = veor_u64x2(t0q, t1q);
+  t0q = veor_u64x2(t0q, t2q);
+  t = (uint64x2_t)vextq_u8((uint8x16_t)t0q, (uint8x16_t)k0, 4);
+  t0q = (uint64x2_t)vextq_u8((uint8x16_t)k0, (uint8x16_t)t0q, 16 - 12);
+  r0 = veor_u64x2(r0, t0q);
+  t0q = (uint64x2_t)vshrq_n_u32((uint32x4_t)r0, 1);
+  t1q = (uint64x2_t)vshrq_n_u32((uint32x4_t)r0, 2);
+  t2q = (uint64x2_t)vshrq_n_u32((uint32x4_t)r0, 7);
+  t0q = veor_u64x2(t0q, t1q);
+  t0q = veor_u64x2(t0q, t2q);
+  t0q = veor_u64x2(t0q, t);
+  r0 = veor_u64x2(r0, t0q);
+  return veor_u64x2(r0, r1);
+}
+
+ASM_FUNC_ATTR_NOINLINE unsigned int
+_gcry_ghash_aarch64_simd(gcry_cipher_hd_t c, byte *result, const byte *buf,
+                         size_t nblocks)
+{
+  uint64x2_t rhash;
+  uint64x2_t rh1;
+  uint64x2_t rbuf;
+  uint64x2x2_t rr0rr1;
+
+  if (nblocks == 0)
+    return 0;
+
+  rhash = vld1q_u64((const void *)result);
+  rh1 = vld1q_u64((const void *)c->u_mode.gcm.u_ghash_key.key);
+
+  rhash = byteswap_u64x2(rhash);
+
+  rbuf = vld1q_u64((const void *)buf);
+  buf += 16;
+  nblocks--;
+
+  rbuf = byteswap_u64x2(rbuf);
+
+  rhash = veor_u64x2(rhash, rbuf);
+
+  while (nblocks)
+    {
+      rbuf = vld1q_u64((const void *)buf);
+      buf += 16;
+      nblocks--;
+
+      rr0rr1 = pmul_128x128(rhash, rh1);
+
+      rbuf = byteswap_u64x2(rbuf);
+
+      rhash = reduction(rr0rr1);
+
+      rhash = veor_u64x2(rhash, rbuf);
+    }
+
+  rr0rr1 = pmul_128x128(rhash, rh1);
+  rhash = reduction(rr0rr1);
+
+  rhash = byteswap_u64x2(rhash);
+
+  vst1q_u64((void *)result, rhash);
+
+  clear_vec_regs();
+
+  return 0;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh_1(void *r_out, uint64x2_t i)
+{
+  static const uint64x1_t const_d = { U64_C(0xc200000000000000) };
+  uint64x1_t ia = vget_low_u64(i);
+  uint64x1_t ib = vget_high_u64(i);
+  uint64x1_t oa, ob, ma;
+
+  ma = (uint64x1_t)vshr_n_s64((int64x1_t)ib, 63);
+  oa = vshr_n_u64(ib, 63);
+  ob = vshr_n_u64(ia, 63);
+  ma = vand_u64x1(ma, const_d);
+  ib = vshl_n_u64(ib, 1);
+  ia = vshl_n_u64(ia, 1);
+  ob = vorr_u64x1(ob, ib);
+  oa = vorr_u64x1(oa, ia);
+  ob = veor_u64x1(ob, ma);
+  vst2_u64(r_out, (const uint64x1x2_t){ .val = { oa, ob } });
+}
+
+ASM_FUNC_ATTR_NOINLINE void
+_gcry_ghash_setup_aarch64_simd(gcry_cipher_hd_t c)
+{
+  uint64x2_t rhash = vld1q_u64((const void *)c->u_mode.gcm.u_ghash_key.key);
+
+  rhash = byteswap_u64x2(rhash);
+
+  gcm_lsh_1(c->u_mode.gcm.u_ghash_key.key, rhash);
+
+  clear_vec_regs();
+}
+
+#endif /* GCM_USE_AARCH64 */
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index d3c04d58..9fbdb02e 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -102,6 +102,13 @@ ghash_armv7_neon (gcry_cipher_hd_t c, byte *result, const byte *buf,
 }
 #endif /* GCM_USE_ARM_NEON */
 
+#ifdef GCM_USE_AARCH64
+extern void _gcry_ghash_setup_aarch64_simd(gcry_cipher_hd_t c);
+
+extern unsigned int _gcry_ghash_aarch64_simd(gcry_cipher_hd_t c, byte *result,
+                                             const byte *buf, size_t nblocks);
+#endif /* GCM_USE_AARCH64 */
+
 #ifdef GCM_USE_S390X_CRYPTO
 #include "asm-inline-s390x.h"
 
@@ -607,6 +614,13 @@ setupM (gcry_cipher_hd_t c)
       ghash_setup_armv7_neon (c);
     }
 #endif
+#ifdef GCM_USE_AARCH64
+  else if (features & HWF_ARM_NEON)
+    {
+      c->u_mode.gcm.ghash_fn = _gcry_ghash_aarch64_simd;
+      _gcry_ghash_setup_aarch64_simd (c);
+    }
+#endif
 #ifdef GCM_USE_PPC_VPMSUM
   else if (features & HWF_PPC_VCRYPTO)
     {
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index cd8ff788..ddf8fbb5 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -112,6 +112,12 @@
 #endif
 #endif /* GCM_USE_ARM_NEON */
 
+/* GCM_USE_AARCH64 indicates whether to compile GCM with AArch64 SIMD code. */
+#undef GCM_USE_AARCH64
+#if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS)
+# define GCM_USE_AARCH64 1
+#endif
+
 /* GCM_USE_S390X_CRYPTO indicates whether to enable zSeries code. */
 #undef GCM_USE_S390X_CRYPTO
 #if defined(HAVE_GCC_INLINE_ASM_S390X)
diff --git a/configure.ac b/configure.ac
index 6347ea25..a7f922b1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3644,6 +3644,7 @@ case "${host}" in
       GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch32-ce.lo"
     ;;
    aarch64-*-*)
+      GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-aarch64-simd.lo"
       GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch64-ce.lo"
     ;;
    powerpc64le-*-* | powerpc64-*-* | powerpc-*-*)
--
2.45.2

_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel
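For cross-checking the SIMD path against the specification, a minimal
bit-serial GHASH reference is sketched below, written directly from the
GCM definition (NIST SP 800-38D, Algorithm 1). It is illustrative only;
the function names are ad hoc and not part of the patch or of libgcrypt:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Multiply two 16-byte blocks in GF(2^128) using GCM's reflected
       bit order: the most significant bit of byte 0 is the coefficient
       of x^0.  Bit-serial and slow, but follows the specification line
       by line. */
    static void
    gf128_mul (uint8_t r[16], const uint8_t x[16], const uint8_t y[16])
    {
      uint8_t z[16] = { 0 };
      uint8_t v[16];
      int i, j;

      memcpy (v, x, 16);
      for (i = 0; i < 128; i++)
        {
          int carry = v[15] & 1;

          /* If bit i of y is set, z ^= v. */
          if (y[i / 8] & (0x80 >> (i % 8)))
            for (j = 0; j < 16; j++)
              z[j] ^= v[j];

          /* v = v * x: shift the bit string right by one; if a bit
             fell off the end, fold in R = 0xe1 || 0^120. */
          for (j = 15; j > 0; j--)
            v[j] = (v[j] >> 1) | (v[j - 1] << 7);
          v[0] >>= 1;
          if (carry)
            v[0] ^= 0xe1;
        }
      memcpy (r, z, 16);
    }

    /* GHASH over full blocks: hash = (hash ^ block) * H per block.
       A correct accelerated implementation must produce the same final
       hash for the same key H and input, regardless of the pre-shifted
       key representation it keeps internally (gcm_lsh_1 above). */
    static void
    ghash_ref (uint8_t hash[16], const uint8_t h[16],
               const uint8_t *buf, size_t nblocks)
    {
      uint8_t t[16];
      size_t n;
      int j;

      for (n = 0; n < nblocks; n++, buf += 16)
        {
          for (j = 0; j < 16; j++)
            hash[j] ^= buf[j];
          gf128_mul (t, hash, h);
          memcpy (hash, t, 16);
        }
    }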