Module Name:    src
Committed By:   riastradh
Date:           Sat Jul 25 22:36:06 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: aes_neon.h aes_neon_impl.c aes_neon_subr.c
            arm_neon.h

Log Message:
Implement AES-CCM with NEON.


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_neon.h \
    src/sys/crypto/aes/arch/arm/aes_neon_subr.c
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_impl.c \
    src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon.h
diff -u src/sys/crypto/aes/arch/arm/aes_neon.h:1.2 src/sys/crypto/aes/arch/arm/aes_neon.h:1.3
--- src/sys/crypto/aes/arch/arm/aes_neon.h:1.2	Sat Jul 25 22:12:57 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.h	Sat Jul 25 22:36:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon.h,v 1.2 2020/07/25 22:12:57 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.h,v 1.3 2020/07/25 22:36:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -59,6 +59,12 @@ void aes_neon_xts_enc(const struct aesen
     uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
 void aes_neon_xts_dec(const struct aesdec *, const uint8_t[static 16],
     uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aes_neon_cbcmac_update1(const struct aesenc *, const uint8_t[static 16],
+    size_t, uint8_t[static 16], uint32_t);
+void aes_neon_ccm_enc1(const struct aesenc *, const uint8_t[static 16],
+    uint8_t[static 16], size_t, uint8_t[static 32], uint32_t);
+void aes_neon_ccm_dec1(const struct aesenc *, const uint8_t[static 16],
+    uint8_t[static 16], size_t, uint8_t[static 32], uint32_t);
 
 int aes_neon_selftest(void);
 
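Note on the new prototypes: the 32-byte authctr argument to aes_neon_ccm_enc1
and aes_neon_ccm_dec1 carries both halves of the CCM state side by side, which
the aes_neon_subr.c changes below rely on.  A minimal illustration of that
layout (this struct and its names are hypothetical, not part of the NetBSD
sources):

#include <stdint.h>

/*
 * Hypothetical overlay of the 32-byte authctr buffer taken by
 * aes_neon_ccm_enc1/aes_neon_ccm_dec1; not part of the NetBSD sources.
 */
struct ccm_authctr {
	uint8_t auth[16];	/* running CBC-MAC over the message blocks */
	uint8_t ctr_be[16];	/* CTR-mode counter block; its last 32 bits
				 * are a big-endian per-block counter */
};

Both halves are read on entry and stored back on return, so a long message can
be processed with repeated calls.
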
Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.2 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.3
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.2	Tue Jun 30 20:32:11 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sat Jul 25 22:36:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,9 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $");
+
+#include <sys/endian.h>
 
 #ifdef _KERNEL
 #include <sys/systm.h>
@@ -213,6 +215,89 @@ aes_neon_xts_dec(const struct aesdec *de
 	storeblock(tweak, t);
 }
 
+void
+aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
+    size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
+{
+	uint8x16_t auth;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	auth = loadblock(auth0);
+	for (; nbytes; nbytes -= 16, in += 16)
+		auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds);
+	storeblock(auth0, auth);
+}
+
+/*
+ * XXX On aarch64, we have enough registers that we should be able to
+ * pipeline two simultaneous vpaes computations in an `aes_neon_enc2'
+ * function, which should substantially improve CCM throughput.
+ */
+
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+#define	vbetoh32q_u8	vrev32q_u8
+#define	vhtobe32q_u8	vrev32q_u8
+#elif _BYTE_ORDER == _BIG_ENDIAN
+#define	vbetoh32q_u8(x)	(x)
+#define	vhtobe32q_u8(x)	(x)
+#else
+#error what kind of endian are you anyway
+#endif
+
+void
+aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
+    uint32_t nrounds)
+{
+	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
+	uint8x16_t auth, ptxt, ctr_be;
+	uint32x4_t ctr;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	auth = loadblock(authctr);
+	ctr_be = loadblock(authctr + 16);
+	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		ptxt = loadblock(in);
+		auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
+		ctr = vaddq_u32(ctr, ctr32_inc);
+		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds));
+	}
+	storeblock(authctr, auth);
+	storeblock(authctr + 16, ctr_be);
+}
+
+void
+aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
+    uint32_t nrounds)
+{
+	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
+	uint8x16_t auth, ctr_be, ptxt;
+	uint32x4_t ctr;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	auth = loadblock(authctr);
+	ctr_be = loadblock(authctr + 16);
+	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		ctr = vaddq_u32(ctr, ctr32_inc);
+		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds);
+		storeblock(out, ptxt);
+		auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
+	}
+	storeblock(authctr, auth);
+	storeblock(authctr + 16, ctr_be);
+}
+
 int
 aes_neon_selftest(void)
 {

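To make the NEON loop above easier to follow, here is a purely scalar sketch of
what one iteration of aes_neon_ccm_enc1 computes per 16-byte block.  The
block_cipher_t callback and all names here are illustrative stand-ins for the
vpaes-based aes_neon_enc1, and the explicit byte handling mirrors what
vbetoh32q_u8/vaddq_u32/vhtobe32q_u8 accomplish on a little-endian machine:

#include <stdint.h>

/* Stand-in for aes_neon_enc1: encrypt one 16-byte block with an expanded key. */
typedef void (*block_cipher_t)(const void *key, const uint8_t in[16],
    uint8_t out[16]);

/*
 * Scalar model of one aes_neon_ccm_enc1 iteration (illustrative only):
 * fold the plaintext block into the CBC-MAC, bump the 32-bit big-endian
 * counter in the last word of the counter block, and encrypt the counter
 * block to produce the keystream.
 */
static void
ccm_enc_block_ref(block_cipher_t E, const void *key, const uint8_t in[16],
    uint8_t out[16], uint8_t auth[16], uint8_t ctr_be[16])
{
	uint8_t tmp[16];
	uint32_t ctr;
	unsigned i;

	/* CBC-MAC step: auth = E(auth ^ plaintext) */
	for (i = 0; i < 16; i++)
		tmp[i] = auth[i] ^ in[i];
	E(key, tmp, auth);

	/* Increment the big-endian 32-bit counter in bytes 12..15. */
	ctr = ((uint32_t)ctr_be[12] << 24) | ((uint32_t)ctr_be[13] << 16) |
	    ((uint32_t)ctr_be[14] << 8) | ctr_be[15];
	ctr++;
	ctr_be[12] = ctr >> 24;
	ctr_be[13] = ctr >> 16;
	ctr_be[14] = ctr >> 8;
	ctr_be[15] = ctr;

	/* CTR step: out = plaintext ^ E(counter block) */
	E(key, ctr_be, tmp);
	for (i = 0; i < 16; i++)
		out[i] = in[i] ^ tmp[i];
}

Decryption (aes_neon_ccm_dec1) performs the same two AES calls per block, just
in the opposite order, because the CBC-MAC must be taken over the recovered
plaintext.  The XXX comment in the diff notes that on aarch64 those two
independent AES computations could eventually be interleaved in a two-block
helper for better throughput.
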
Index: src/sys/crypto/aes/arch/arm/aes_neon_impl.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_impl.c:1.3 src/sys/crypto/aes/arch/arm/aes_neon_impl.c:1.4
--- src/sys/crypto/aes/arch/arm/aes_neon_impl.c:1.3	Sat Jul 25 22:12:57 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.c	Sat Jul 25 22:36:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_impl.c,v 1.3 2020/07/25 22:12:57 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_impl.c,v 1.4 2020/07/25 22:36:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_impl.c,v 1.3 2020/07/25 22:12:57 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_impl.c,v 1.4 2020/07/25 22:36:06 riastradh Exp $");
 
 #include <sys/types.h>
 #include <sys/proc.h>
@@ -144,6 +144,39 @@ aes_neon_xts_dec_impl(const struct aesde
 	fpu_kern_leave();
 }
 
+static void
+aes_neon_cbcmac_update1_impl(const struct aesenc *enc,
+    const uint8_t in[static 16], size_t nbytes, uint8_t auth[static 16],
+    uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aes_neon_cbcmac_update1(enc, in, nbytes, auth, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_neon_ccm_enc1_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
+    uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aes_neon_ccm_enc1(enc, in, out, nbytes, authctr, nrounds);
+	fpu_kern_leave();
+}
+
+static void
+aes_neon_ccm_dec1_impl(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
+    uint32_t nrounds)
+{
+
+	fpu_kern_enter();
+	aes_neon_ccm_dec1(enc, in, out, nbytes, authctr, nrounds);
+	fpu_kern_leave();
+}
+
 static int
 aes_neon_probe(void)
 {
@@ -204,4 +237,7 @@ struct aes_impl aes_neon_impl = {
 	.ai_cbc_dec = aes_neon_cbc_dec_impl,
 	.ai_xts_enc = aes_neon_xts_enc_impl,
 	.ai_xts_dec = aes_neon_xts_dec_impl,
+	.ai_cbcmac_update1 = aes_neon_cbcmac_update1_impl,
+	.ai_ccm_enc1 = aes_neon_ccm_enc1_impl,
+	.ai_ccm_dec1 = aes_neon_ccm_dec1_impl,
 };
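
The _impl wrappers above exist to bracket the NEON routines with
fpu_kern_enter()/fpu_kern_leave(), since kernel code may only touch the SIMD
register file inside such a section, and to fill in the new
ai_cbcmac_update1/ai_ccm_enc1/ai_ccm_dec1 hooks of aes_neon_impl.  A rough
sketch of how a caller might dispatch through such a hook table (the field
name comes from the diff; the surrounding types and code are illustrative,
not the NetBSD AES framework):

#include <stddef.h>
#include <stdint.h>

struct aesenc;				/* expanded encryption key (opaque here) */

/* Illustrative subset of the hook table; the real struct aes_impl has more members. */
struct my_aes_impl {
	void (*ai_ccm_enc1)(const struct aesenc *, const uint8_t *,
	    uint8_t *, size_t, uint8_t *, uint32_t);
};

/*
 * Hypothetical dispatcher: whichever backend was selected at boot
 * (NEON, scalar, ...) is reached through the same hook, so CCM users
 * never call aes_neon_ccm_enc1 directly.
 */
static void
ccm_enc(const struct my_aes_impl *impl, const struct aesenc *enc,
    const uint8_t *in, uint8_t *out, size_t nbytes, uint8_t authctr[32],
    uint32_t nrounds)
{
	impl->ai_ccm_enc1(enc, in, out, nbytes, authctr, nrounds);
}
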
Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.3 src/sys/crypto/aes/arch/arm/arm_neon.h:1.4
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.3	Thu Jul 23 11:33:01 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Sat Jul 25 22:36:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.3 2020/07/23 11:33:01 ryo Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.4 2020/07/25 22:36:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -91,6 +91,13 @@ typedef struct { uint8x8_t val[2]; } uin
 
 _INTRINSATTR
 static __inline uint32x4_t
+vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
+{
+	return __v0 + __v1;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
 vcltq_s32(int32x4_t __v0, int32x4_t __v1)
 {
 	return (uint32x4_t)(__v0 < __v1);
@@ -328,6 +335,19 @@ vreinterpretq_u8_u64(uint64x2_t __v)
 	return (uint8x16_t)__v;
 }
 
+_INTRINSATTR
+static __inline uint8x16_t
+vrev32q_u8(uint8x16_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return __builtin_shuffle(__v,
+	    (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
+#elif defined(__clang__)
+	return __builtin_shufflevector(__v,
+	    3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
+#endif
+}
+
 #if defined(__GNUC__) && !defined(__clang__)
 _INTRINSATTR
 static __inline uint32x4_t

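For reference, the new vrev32q_u8 definition reverses the bytes within each
32-bit lane of a 128-bit vector, which is why the CCM code can use the same
intrinsic for both vbetoh32q_u8 and vhtobe32q_u8 on little-endian hosts.  A
scalar sketch of the equivalent operation (illustrative only, not part of the
NetBSD sources):

#include <stdint.h>

/* Scalar model of vrev32q_u8: reverse byte order within each 32-bit lane. */
static void
rev32_bytes(uint8_t v[16])
{
	unsigned i;
	uint8_t t;

	for (i = 0; i < 16; i += 4) {
		t = v[i];     v[i]     = v[i + 3]; v[i + 3] = t;
		t = v[i + 1]; v[i + 1] = v[i + 2]; v[i + 2] = t;
	}
}

Since the operation is its own inverse, the one shuffle serves both directions
of the endian conversion, and on big-endian builds both macros collapse to the
identity.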