Module Name: src
Committed By: riastradh
Date: Tue Jul 28 20:11:09 UTC 2020
Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon.c aes_neon_impl.h aes_neon_subr.c
arm_neon.h
Log Message:
Draft 2x vectorized neon vpaes for aarch64.
Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; the same
improvement should probably carry over to the rpi4 CPU, which lacks
ARMv8.0-AES.
To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c \
src/sys/crypto/aes/arch/arm/aes_neon_subr.c
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/crypto/aes/arch/arm/aes_neon.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.3 src/sys/crypto/aes/arch/arm/aes_neon.c:1.4
--- src/sys/crypto/aes/arch/arm/aes_neon.c:1.3 Tue Jun 30 20:32:11 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.c Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */
+/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
#include <sys/types.h>
@@ -589,6 +589,59 @@ aes_neon_enc1(const struct aesenc *enc,
return vqtbl1q_u8(x, sr[rmod4]);
}
+uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
+{
+ const uint32_t *rk32 = enc->aese_aes.aes_rk;
+ uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+ uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+ uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
+ uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
+ uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
+ uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
+ uint8x16_t x0 = x.val[0], x1 = x.val[1];
+ uint8x16_t io0, jo0, io1, jo1;
+ unsigned rmod4 = 0;
+
+ x0 = aes_schedule_transform(x0, ipt);
+ x1 = aes_schedule_transform(x1, ipt);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ for (;;) {
+ uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
+ uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
+
+ subbytes(&io0, &jo0, x0, inv_, inva_);
+ subbytes(&io1, &jo1, x1, inv_, inva_);
+
+ rk32 += 4;
+ rmod4 = (rmod4 + 1) % 4;
+ if (--nrounds == 0)
+ break;
+
+ A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
+ A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
+ A_0 ^= loadroundkey(rk32);
+ A_1 ^= loadroundkey(rk32);
+ A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
+ A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
+ A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
+ A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
+ A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
+ A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
+ x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
+ x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
+ }
+ x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
+ x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ return (uint8x16x2_t) { .val = {
+ [0] = vqtbl1q_u8(x0, sr[rmod4]),
+ [1] = vqtbl1q_u8(x1, sr[rmod4]),
+ } };
+}
+
uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
@@ -628,4 +681,60 @@ aes_neon_dec1(const struct aesdec *dec,
return vqtbl1q_u8(x, sr[i]);
}
+uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
+{
+ const uint32_t *rk32 = dec->aesd_aes.aes_rk;
+ unsigned i = 3 & ~(nrounds - 1);
+ uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+ uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+ uint8x16_t x0 = x.val[0], x1 = x.val[1];
+ uint8x16_t io0, jo0, io1, jo1, mc;
+
+ x0 = aes_schedule_transform(x0, dipt);
+ x1 = aes_schedule_transform(x1, dipt);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ rk32 += 4;
+
+ mc = mc_forward[3];
+ for (;;) {
+ subbytes(&io0, &jo0, x0, inv_, inva_);
+ subbytes(&io1, &jo1, x1, inv_, inva_);
+ if (--nrounds == 0)
+ break;
+
+ x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
+ x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ rk32 += 4; /* next round key */
+
+ x0 = vqtbl1q_u8(x0, mc);
+ x1 = vqtbl1q_u8(x1, mc);
+ x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
+ x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
+
+ x0 = vqtbl1q_u8(x0, mc);
+ x1 = vqtbl1q_u8(x1, mc);
+ x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
+ x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
+
+ x0 = vqtbl1q_u8(x0, mc);
+ x1 = vqtbl1q_u8(x1, mc);
+ x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
+ x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
+
+ mc = vextq_u8(mc, mc, 12);
+ }
+ x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
+ x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ return (uint8x16x2_t) { .val = {
+ [0] = vqtbl1q_u8(x0, sr[i]),
+ [1] = vqtbl1q_u8(x1, sr[i]),
+ } };
+}
+
#endif
Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.3 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.4
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.3 Sat Jul 25 22:36:06 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */
+/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
#include <sys/endian.h>
@@ -111,14 +111,33 @@ aes_neon_cbc_dec(const struct aesdec *de
cv = loadblock(in + nbytes - 16);
storeblock(iv, cv);
- for (;;) {
+ if (nbytes % 32) {
+ KASSERT(nbytes % 32 == 16);
b = aes_neon_dec1(dec, cv, nrounds);
if ((nbytes -= 16) == 0)
- break;
+ goto out;
+ cv = loadblock(in + nbytes - 16);
+ storeblock(out + nbytes, cv ^ b);
+ }
+
+ for (;;) {
+ uint8x16x2_t b2;
+
+ KASSERT(nbytes >= 32);
+
+ b2.val[1] = cv;
+ b2.val[0] = cv = loadblock(in + nbytes - 32);
+ b2 = aes_neon_dec2(dec, b2, nrounds);
+ storeblock(out + nbytes - 16, cv ^ b2.val[1]);
+ if ((nbytes -= 32) == 0) {
+ b = b2.val[0];
+ goto out;
+ }
cv = loadblock(in + nbytes - 16);
- storeblock(out + nbytes, b ^ cv);
+ storeblock(out + nbytes, cv ^ b2.val[0]);
}
- storeblock(out, b ^ iv0);
+
+out: storeblock(out, b ^ iv0);
}
static inline uint8x16_t
@@ -186,11 +205,28 @@ aes_neon_xts_enc(const struct aesenc *en
KASSERT(nbytes % 16 == 0);
t = loadblock(tweak);
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+ if (nbytes % 32) {
+ KASSERT(nbytes % 32 == 16);
b = t ^ loadblock(in);
b = aes_neon_enc1(enc, b, nrounds);
storeblock(out, t ^ b);
t = aes_neon_xts_update(t);
+ nbytes -= 16;
+ in += 16;
+ out += 16;
+ }
+ for (; nbytes; nbytes -= 32, in += 32, out += 32) {
+ uint8x16_t t1;
+ uint8x16x2_t b2;
+
+ t1 = aes_neon_xts_update(t);
+ b2.val[0] = t ^ loadblock(in);
+ b2.val[1] = t1 ^ loadblock(in + 16);
+ b2 = aes_neon_enc2(enc, b2, nrounds);
+ storeblock(out, b2.val[0] ^ t);
+ storeblock(out + 16, b2.val[1] ^ t1);
+
+ t = aes_neon_xts_update(t1);
}
storeblock(tweak, t);
}
@@ -206,11 +242,28 @@ aes_neon_xts_dec(const struct aesdec *de
KASSERT(nbytes % 16 == 0);
t = loadblock(tweak);
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+ if (nbytes % 32) {
+ KASSERT(nbytes % 32 == 16);
b = t ^ loadblock(in);
b = aes_neon_dec1(dec, b, nrounds);
storeblock(out, t ^ b);
t = aes_neon_xts_update(t);
+ nbytes -= 16;
+ in += 16;
+ out += 16;
+ }
+ for (; nbytes; nbytes -= 32, in += 32, out += 32) {
+ uint8x16_t t1;
+ uint8x16x2_t b2;
+
+ t1 = aes_neon_xts_update(t);
+ b2.val[0] = t ^ loadblock(in);
+ b2.val[1] = t1 ^ loadblock(in + 16);
+ b2 = aes_neon_dec2(dec, b2, nrounds);
+ storeblock(out, b2.val[0] ^ t);
+ storeblock(out + 16, b2.val[1] ^ t1);
+
+ t = aes_neon_xts_update(t1);
}
storeblock(tweak, t);
}
@@ -262,11 +315,16 @@ aes_neon_ccm_enc1(const struct aesenc *e
ctr_be = loadblock(authctr + 16);
ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+ uint8x16x2_t b2;
ptxt = loadblock(in);
- auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
ctr = vaddq_u32(ctr, ctr32_inc);
ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
- storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds));
+
+ b2.val[0] = auth ^ ptxt;
+ b2.val[1] = ctr_be;
+ b2 = aes_neon_enc2(enc, b2, nrounds);
+ auth = b2.val[0];
+ storeblock(out, ptxt ^ b2.val[1]);
}
storeblock(authctr, auth);
storeblock(authctr + 16, ctr_be);
@@ -278,22 +336,37 @@ aes_neon_ccm_dec1(const struct aesenc *e
uint32_t nrounds)
{
const uint32x4_t ctr32_inc = {0, 0, 0, 1};
- uint8x16_t auth, ctr_be, ptxt;
+ uint8x16_t auth, ctr_be, ptxt, pad;
uint32x4_t ctr;
KASSERT(nbytes);
KASSERT(nbytes % 16 == 0);
- auth = loadblock(authctr);
ctr_be = loadblock(authctr + 16);
ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+ ctr = vaddq_u32(ctr, ctr32_inc);
+ ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+ pad = aes_neon_enc1(enc, ctr_be, nrounds);
+ auth = loadblock(authctr);
+ for (;; in += 16, out += 16) {
+ uint8x16x2_t b2;
+
+ ptxt = loadblock(in) ^ pad;
+ auth ^= ptxt;
+ storeblock(out, ptxt);
+
+ if ((nbytes -= 16) == 0)
+ break;
+
ctr = vaddq_u32(ctr, ctr32_inc);
ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
- ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds);
- storeblock(out, ptxt);
- auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
+ b2.val[0] = auth;
+ b2.val[1] = ctr_be;
+ b2 = aes_neon_enc2(enc, b2, nrounds);
+ auth = b2.val[0];
+ pad = b2.val[1];
}
+ auth = aes_neon_enc1(enc, auth, nrounds);
storeblock(authctr, auth);
storeblock(authctr + 16, ctr_be);
}
Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.h
diff -u src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.1 src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.2
--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h:1.1 Mon Jun 29 23:56:31 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */
+/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,4 +39,33 @@
uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
+#ifdef __aarch64__
+
+uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned);
+uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned);
+
+#else
+
+static inline uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds)
+{
+
+ return (uint8x16x2_t) { .val = {
+ [0] = aes_neon_enc1(enc, b2.val[0], nrounds),
+ [1] = aes_neon_enc1(enc, b2.val[1], nrounds),
+ } };
+}
+
+static inline uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds)
+{
+
+ return (uint8x16x2_t) { .val = {
+ [0] = aes_neon_dec1(dec, b2.val[0], nrounds),
+ [1] = aes_neon_dec1(dec, b2.val[1], nrounds),
+ } };
+}
+
+#endif
+
#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */
Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.6 src/sys/crypto/aes/arch/arm/arm_neon.h:1.7
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.6 Sat Jul 25 22:43:01 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */
+/* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -42,6 +42,7 @@ typedef __Int8x16_t int8x16_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint64x2_t uint64x2_t;
typedef __Uint8x16_t uint8x16_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#else
typedef __simd128_int32_t int32x4_t;
typedef __simd128_int64_t int64x2_t;
@@ -54,6 +55,7 @@ typedef __simd64_int8_t int8x8_t;
typedef __simd64_uint8_t uint8x8_t;
typedef __builtin_neon_udi uint64x1_t;
typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#endif
#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)