CVS commit: src/sys/crypto/aes/arch/arm

2023-08-06 Thread Rin Okuyama
Module Name:    src
Committed By:   rin
Date:   Mon Aug  7 00:58:35 UTC 2023

Modified Files:
src/sys/crypto/aes/arch/arm: arm_neon.h

Log Message:
sys/crypto/{aes,chacha}/arch/arm/arm_neon.h: Sync (whitespace fix)

No binary changes.


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.11 src/sys/crypto/aes/arch/arm/arm_neon.h:1.12
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.11	Mon Sep  7 18:06:13 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Mon Aug  7 00:58:35 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.11 2020/09/07 18:06:13 jakllsch Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.12 2023/08/07 00:58:35 rin Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -232,7 +232,7 @@ static __inline uint32_t
 vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
 {
 #ifdef __aarch64__
-	return __v[__neon_laneq_index(__v,__i)];
+	return __v[__neon_laneq_index(__v, __i)];
 #else
 	return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
 #endif



CVS commit: src/sys/crypto/aes/arch/arm

2022-06-26 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Sun Jun 26 17:52:54 UTC 2022

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_subr.c

Log Message:
arm/aes_neon: Fix formatting of self-test failure message.

Discovered by code inspection.  Remarkably, a combination of errors
made this fail to be a stack buffer overrun.  Verified by booting
with ARMv8.0-AES disabled and with the self-test artificially made to
fail.
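
For reference, the arithmetic: each " %02hhx" conversion emits three
characters, so dumping 16 bytes takes 3*16 = 48 characters plus a NUL,
and the write offset must advance by 3 per byte.  The old code sized the
buffer at 33 and advanced by 2*j; only snprintf's truncation kept the
overlapping writes in bounds.  A standalone sketch of the corrected
pattern (illustrative only, not the committed code):

    #include <stdio.h>

    /* Format 16 bytes as " xx xx ... xx": 3 chars per byte, plus NUL. */
    static void
    hexdump16(char buf[3*16 + 1], const unsigned char t[16])
    {
        unsigned j;

        for (j = 0; j < 16; j++)
            snprintf(buf + 3*j, (size_t)(3*16 + 1) - 3*j,
                " %02hhx", t[j]);
    }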


To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/aes_neon_subr.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.7 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.8
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.7	Sun Aug  9 02:48:38 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sun Jun 26 17:52:54 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_subr.c,v 1.7 2020/08/09 02:48:38 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_subr.c,v 1.8 2022/06/26 17:52:54 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.7 2020/08/09 02:48:38 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.8 2022/06/26 17:52:54 riastradh Exp $");
 
 #ifdef _KERNEL
 #include 
@@ -183,11 +183,11 @@ aes_neon_xts_update_selftest(void)
 	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
 		storeblock(t, aes_neon_xts_update(loadblock(cases[i].in)));
 		if (memcmp(t, cases[i].out, 16)) {
-			char buf[33];
+			char buf[3*16 + 1];
 			unsigned j;
 
 			for (j = 0; j < 16; j++) {
-snprintf(buf + 2*j, sizeof(buf) - 2*j,
+snprintf(buf + 3*j, sizeof(buf) - 3*j,
 " %02hhx", t[j]);
 			}
 			printf("%s %u: %s\n", __func__, i, buf);



CVS commit: src/sys/crypto/aes/arch/arm

2020-11-21 Thread Rin Okuyama
Module Name:    src
Committed By:   rin
Date:   Sat Nov 21 08:09:21 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon.c

Log Message:
Fix build with clang for earmv7hf; loadroundkey() is used only for __aarch64__.
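
Background: with -Werror, clang's -Wunused-function rejects a file-scope
static function that nothing in the translation unit references, and
loadroundkey() has no callers outside __aarch64__-guarded code.  A
minimal reproduction of the failure mode (hypothetical names, assuming
clang -Wunused-function -Werror):

    /* error: unused function 'aarch64_only_helper'; the fix is to
     * guard the helper with the same #ifdef __aarch64__ as its only
     * callers, exactly as the diff below does. */
    static inline int
    aarch64_only_helper(int x)
    {
        return x + 1;
    }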


To generate a diff of this commit:
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.5 src/sys/crypto/aes/arch/arm/aes_neon.c:1.6
--- src/sys/crypto/aes/arch/arm/aes_neon.c:1.5	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.c	Sat Nov 21 08:09:21 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $");
 
 #include 
 
@@ -196,11 +196,13 @@ inv	= VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0
 inva	= VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
 	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);
 
+#ifdef __aarch64__
 static inline uint8x16_t
 loadroundkey(const void *rkp)
 {
 	return vld1q_u8(rkp);
 }
+#endif
 
 static inline void
 storeroundkey(void *rkp, uint8x16_t rk)



CVS commit: src/sys/crypto/aes/arch/arm

2020-09-10 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Thu Sep 10 11:31:04 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
aes neon: Gather mc_forward/backward so we can load 256 bits at once.
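
The layout change, rendered as C for clarity (names illustrative, not
the committed code): interleaving each forward row with its backward row
makes entry i of both tables one contiguous 32-byte object, so a single
256-bit vld1.8 {q12-q13} fetches both, and the index now scales by 32
(lsl #5) instead of 16 (lsl #4):

    #include <stdint.h>

    /* One 32-byte entry per round index: forward row, then backward. */
    struct mc_entry {
        uint8_t forward[16];
        uint8_t backward[16];
    };

    /* mc[rmod4].forward and .backward are adjacent in memory, so one
     * 256-bit load replaces two 128-bit loads from separate tables. */
    static const struct mc_entry mc[4];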


To generate a diff of this commit:
cvs rdiff -u -r1.10 -r1.11 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.10 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.11
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.10	Thu Sep 10 11:30:28 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Thu Sep 10 11:31:03 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $")
 
 	.fpu	neon
 
@@ -54,36 +54,26 @@ inva:
 	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
 END(inva)
 
-	.type	mc_forward,_ASM_TYPE_OBJECT
-mc_forward:
-	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 */
+	.type	mc,_ASM_TYPE_OBJECT
+mc:
+	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 forward */
 	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
-
-	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 */
+	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 backward */
+	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
+	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 forward */
 	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
-
-	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 */
+	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 backward */
+	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
+	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 forward */
 	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
-
+	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 backward */
+	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
 .Lmc_forward_3:
-	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 */
+	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 forward */
 	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
-END(mc_forward)
-
-	.type	mc_backward,_ASM_TYPE_OBJECT
-mc_backward:
-	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 */
-	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
-
-	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 */
-	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
-
-	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 */
-	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
-
-	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 */
+	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 backward */
 	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
-END(mc_backward)
+END(mc)
 
 	.type	sr,_ASM_TYPE_OBJECT
 sr:
@@ -210,8 +200,7 @@ ENTRY(aes_neon_enc1)
 
 	/*
 	 * r3: rmod4
-	 * r4: mc_forward
-	 * r5: mc_backward
+	 * r4: mc
 	 * r6,r8,r10,ip: temporaries
 	 * q0={d0-d1}: x/ak/A
 	 * q1={d2-d3}: 0x0f0f...
@@ -225,8 +214,8 @@ ENTRY(aes_neon_enc1)
 	 * q9={d18-d19}: sb2[1]
 	 * q10={d20-d21}: inv
 	 * q11={d22-d23}: inva
-	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_backward[rmod4]
-	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_forward[rmod4]
+	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc[rmod4].backward
+	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc[rmod4].forward
 	 * q14={d28-d29}: rk/A2/A2_B_D
 	 * q15={d30-d31}: A2_B/sr[rmod4]
 	 */
@@ -254,9 +243,8 @@ ENTRY(aes_neon_enc1)
 	vld1.8	{q8-q9}, [r6 :256]	/* q8 = sb2[0], q9 = sb2[1] */
 	vld1.8	{q10-q11}, [r8 :256]	/* q10 = inv, q11 = inva */
 
-	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
-	add	r4, ip, #(mc_forward - .Lconstants)
-	add	r5, ip, #(mc_backward - .Lconstants)
+	/* r4 := mc */
+	add	r4, ip, #(mc - .Lconstants)
 
 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4
@@ -291,13 +279,11 @@ ENTRY(aes_neon_enc1)
 	vtbl.8	d25, {q8}, d5
 	vtbl.8	d26, {q9}, d6
 	vtbl.8	d27, {q9}, d7
+	add	r6, r4, r3, lsl #5	/* r6 := &mc[rmod4] */
 	veor	q14, q12, q13
 
-	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
-	add	r6, r4, r3, lsl #4
-	add	r8, r5, r3, lsl #4
-	vld1.8	{q12}, [r6 :128]
-	vld1.8	{q13}, [r8 :128]
+	/* (q12, q13) := (mc[rmod4].forward, mc[rmod4].backward) */
+	vld1.8	{q12-q13}, [r6 :256]
 
 	/* q15 := A2_B = A2 + A(mcf) */
 	vtbl.8	d30, {q0}, d24
@@ -474,7 +460,7 @@ ENTRY(aes_neon_dec1)
 	add	r8, ip, #(.Lmc_forward_3 - .Lconstants)
 	vld1.8	{q6-q7}, [r4 :256]	/* q6 := dsbb[0], q7 := dsbb[1] */
 	vld1.8	{q10-q11}, [r6 :256]	/* q10 := inv, q11 := inva */
-	vld1.8	{q15}, [r8 :128]	/* q15 := mc_forward[3] */
+	vld1.8	{q15}, [r8 :128]	/* q15 := mc[3].forward */
 
 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4



CVS commit: src/sys/crypto/aes/arch/arm

2020-09-10 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Thu Sep 10 11:30:28 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
aes neon: Hoist dsbd/dsbe address calculation out of loop.
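
In miniature this is loop-invariant code motion; a hedged C analogue
(names illustrative, not the committed assembly):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t sum;   /* stand-in for the real per-round work */

    void
    rounds(const uint8_t *base, size_t dsbd_off, size_t dsbe_off,
        unsigned nrounds)
    {
        const uint8_t *dsbd = base + dsbd_off;  /* hoisted: invariant */
        const uint8_t *dsbe = base + dsbe_off;  /* hoisted: invariant */

        while (nrounds--)
            sum += *dsbd + *dsbe;   /* was: recompute base+off here */
    }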


To generate a diff of this commit:
cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.9 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.10
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.9	Thu Sep 10 11:30:08 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Thu Sep 10 11:30:28 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $")
 
 	.fpu	neon
 
@@ -431,6 +431,9 @@ ENTRY(aes_neon_dec1)
 
 	/*
 	 * r3: 3 & ~(nrounds - 1)
+	 * r4: dsbd
+	 * r5: dsbe
+	 * r6,r8,r10,ip: temporaries
 	 * q0={d0-d1}: x/ak
 	 * q1={d2-d3}: 0x0f0f...
 	 * q2={d4-d5}: lo/k/j/io
@@ -488,6 +491,10 @@ ENTRY(aes_neon_dec1)
 	add	r4, ip, #(dsb9 - .Lconstants)
 	vld1.8	{q4-q5}, [r4 :256]	/* q4 := dsb9[0], q5 := dsb9[1] */
 
+	/* r4 := dsbd, r5 := dsbe */
+	add	r4, ip, #(dsbd - .Lconstants)
+	add	r5, ip, #(dsbe - .Lconstants)
+
 	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
 	veor	q0, q14, q2
 	veor	q0, q0, q3
@@ -496,7 +503,6 @@ ENTRY(aes_neon_dec1)
 
 	_ALIGN_TEXT
 1:	/* load dsbd */
-	add	r4, ip, #(dsbd - .Lconstants)
 	vld1.8	{q8-q9}, [r4 :256]	/* q8 := dsbd[0], q9 := dsbd[1] */
 
 	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
@@ -522,8 +528,7 @@ ENTRY(aes_neon_dec1)
 	veor	q0, q0, q13
 
 	/* load dsbe */
-	add	r4, ip, #(dsbe - .Lconstants)
-	vld1.8	{q8-q9}, [r4 :256]!	/* q8 := dsbe[0], q9 := dsbe[1] */
+	vld1.8	{q8-q9}, [r5 :256]	/* q8 := dsbe[0], q9 := dsbe[1] */
 
 	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
 	vtbl.8	d28, {q0}, d30



CVS commit: src/sys/crypto/aes/arch/arm

2020-09-10 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Thu Sep 10 11:30:08 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
aes neon: Tweak register usage.

- Call r12 by its usual name, ip.
- No need for r7 or r11=fp at the moment.


To generate a diff of this commit:
cvs rdiff -u -r1.8 -r1.9 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.8 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.9
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.8	Thu Sep 10 11:29:43 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Thu Sep 10 11:30:08 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $")
 
 	.fpu	neon
 
@@ -205,14 +205,14 @@ ENTRY(aes_neon_enc1)
 	vldr	d1, [sp]		/* d1 := x hi */
 	ldr	r1, [sp, #8]		/* r1 := nrounds */
 #endif
-	push	{r4, r5, r6, r7, r8, r10, r11, lr}
+	push	{r4, r5, r6, r8, r10, lr}
 	vpush	{d8-d15}
 
 	/*
 	 * r3: rmod4
 	 * r4: mc_forward
 	 * r5: mc_backward
-	 * r6,r7,r8,r10,r11,r12: temporaries
+	 * r6,r8,r10,ip: temporaries
 	 * q0={d0-d1}: x/ak/A
 	 * q1={d2-d3}: 0x0f0f...
 	 * q2={d4-d5}: lo/k/j/io
@@ -231,32 +231,32 @@ ENTRY(aes_neon_enc1)
 	 * q15={d30-d31}: A2_B/sr[rmod4]
 	 */
 
-	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
-	ldr	r12, .Lconstants_addr
-	adr	r11, .Lconstants_addr
+	/* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */
+	ldr	ip, .Lconstants_addr
+	adr	r10, .Lconstants_addr
 
 	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 	movw	r3, #0
 	vmov.i8	q1, #0x0f
 
-	/* r12 := .Lconstants */
-	add	r12, r12, r11
+	/* ip := .Lconstants */
+	add	ip, ip, r10
 
 	/* (q4, q5) := (iptlo, ipthi) */
-	add	r6, r12, #(ipt - .Lconstants)
+	add	r6, ip, #(ipt - .Lconstants)
 	vld1.8	{q4-q5}, [r6 :256]
 
 	/* load the rest of the constants */
-	add	r4, r12, #(sb1 - .Lconstants)
-	add	r6, r12, #(sb2 - .Lconstants)
-	add	r8, r12, #(.Linv_inva - .Lconstants)
+	add	r4, ip, #(sb1 - .Lconstants)
+	add	r6, ip, #(sb2 - .Lconstants)
+	add	r8, ip, #(.Linv_inva - .Lconstants)
 	vld1.8	{q6-q7}, [r4 :256]	/* q6 = sb1[0], q7 = sb1[1] */
 	vld1.8	{q8-q9}, [r6 :256]	/* q8 = sb2[0], q9 = sb2[1] */
 	vld1.8	{q10-q11}, [r8 :256]	/* q10 = inv, q11 = inva */
 
 	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
-	add	r4, r12, #(mc_forward - .Lconstants)
-	add	r5, r12, #(mc_backward - .Lconstants)
+	add	r4, ip, #(mc_forward - .Lconstants)
+	add	r5, ip, #(mc_backward - .Lconstants)
 
 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4
@@ -295,9 +295,9 @@ ENTRY(aes_neon_enc1)
 
 	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
 	add	r6, r4, r3, lsl #4
-	add	r7, r5, r3, lsl #4
+	add	r8, r5, r3, lsl #4
 	vld1.8	{q12}, [r6 :128]
-	vld1.8	{q13}, [r7 :128]
+	vld1.8	{q13}, [r8 :128]
 
 	/* q15 := A2_B = A2 + A(mcf) */
 	vtbl.8	d30, {q0}, d24
@@ -365,8 +365,8 @@ ENTRY(aes_neon_enc1)
 	bne	1b
 
 	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
-	add	r8, r12, #(sr - .Lconstants)
-	add	r6, r12, #(sbo - .Lconstants)
+	add	r8, ip, #(sr - .Lconstants)
+	add	r6, ip, #(sbo - .Lconstants)
 	add	r8, r8, r3, lsl #4
 	vld1.8	{q6-q7}, [r6 :256]
 	vld1.8	{q15}, [r8 :128]
@@ -388,7 +388,7 @@ ENTRY(aes_neon_enc1)
 	vtbl.8	d1, {q2}, d31
 
 	vpop	{d8-d15}
-	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
+	pop	{r4, r5, r6, r8, r10, lr}
 #ifdef __SOFTFP__
 #ifdef __ARM_BIG_ENDIAN
 	vmov	r1, r0, d0
@@ -426,7 +426,7 @@ ENTRY(aes_neon_dec1)
 	vldr	d1, [sp]		/* d1 := x hi */
 	ldr	r1, [sp, #8]		/* r1 := nrounds */
 #endif
-	push	{r4, r5, r6, r7, r8, r10, r11, lr}
+	push	{r4, r5, r6, r8, r10, lr}
 	vpush	{d8-d15}
 
 	/*
@@ -449,26 +449,26 @@ ENTRY(aes_neon_dec1)
 	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
 	 */
 
-	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
-	ldr	r12, .Lconstants_addr
-	adr	r11, .Lconstants_addr
+	/* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */
+	ldr	ip, .Lconstants_addr
+	adr	r10, .Lconstants_addr
 
 	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 	rsb	r3, r1, #0		/* r3 := ~(x - 1) = -x */
 	vmov.i8	q1, #0x0f
 	and	r3, r3, #3		/* r3 := 3 & ~(x - 1) */
 
-	/* r12 := .Lconstants */
-	add	r12, r12, r11
+	/* ip := .Lconstants */
+	add	ip, ip, r10
 
 	/* (q4, q5) := (diptlo, dipthi) */
-	add	r6, r12, #(dipt - .Lconstants)
+	add	r6, ip, #(dipt - .Lconstants)
 	vld1.8	{q4-q5}, [r6 :256]
 
 	/* load the rest of the constants */
-	add	r4, r12, #(dsbb - .Lconstants)
-	add	r6, r12, #(.Linv_inva - .Lconstants)
-	add	r8, r12, #(.Lmc_forward_3 - 

CVS commit: src/sys/crypto/aes/arch/arm

2020-09-10 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Thu Sep 10 11:29:43 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
aes neon: Write vtbl with {qN} rather than {d(2N)-d(2N+1)}.

Cosmetic; no functional change.


To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.8
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7	Thu Sep 10 11:29:02 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Thu Sep 10 11:29:43 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $")
 
 	.fpu	neon
 
@@ -264,10 +264,10 @@ ENTRY(aes_neon_enc1)
 	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */
 
 	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
-	vtbl.8	d4, {d8-d9}, d4
-	vtbl.8	d5, {d8-d9}, d5
-	vtbl.8	d6, {d10-d11}, d6
-	vtbl.8	d7, {d10-d11}, d7
+	vtbl.8	d4, {q4}, d4
+	vtbl.8	d5, {q4}, d5
+	vtbl.8	d6, {q5}, d6
+	vtbl.8	d7, {q5}, d7
 
 	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
 	veor	q0, q14, q2
@@ -279,18 +279,18 @@ ENTRY(aes_neon_enc1)
 1:	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
-	vtbl.8	d24, {d12-d13}, d4
-	vtbl.8	d25, {d12-d13}, d5
-	vtbl.8	d26, {d14-d15}, d6
-	vtbl.8	d27, {d14-d15}, d7
+	vtbl.8	d24, {q6}, d4
+	vtbl.8	d25, {q6}, d5
+	vtbl.8	d26, {q7}, d6
+	vtbl.8	d27, {q7}, d7
 	veor	q0, q14, q12
 	veor	q0, q0, q13
 
 	/* q14 := A2 = sb2_0[io] + sb2_1[jo] */
-	vtbl.8	d24, {d16-d17}, d4
-	vtbl.8	d25, {d16-d17}, d5
-	vtbl.8	d26, {d18-d19}, d6
-	vtbl.8	d27, {d18-d19}, d7
+	vtbl.8	d24, {q8}, d4
+	vtbl.8	d25, {q8}, d5
+	vtbl.8	d26, {q9}, d6
+	vtbl.8	d27, {q9}, d7
 	veor	q14, q12, q13
 
 	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
@@ -300,18 +300,18 @@ ENTRY(aes_neon_enc1)
 	vld1.8	{q13}, [r7 :128]
 
 	/* q15 := A2_B = A2 + A(mcf) */
-	vtbl.8	d30, {d0-d1}, d24
-	vtbl.8	d31, {d0-d1}, d25
+	vtbl.8	d30, {q0}, d24
+	vtbl.8	d31, {q0}, d25
 	veor	q15, q15, q14
 
 	/* q14 := A2_B_D = A2_B + A(mcb) */
-	vtbl.8	d28, {d0-d1}, d26
-	vtbl.8	d29, {d0-d1}, d27
+	vtbl.8	d28, {q0}, d26
+	vtbl.8	d29, {q0}, d27
 	veor	q14, q14, q15
 
 	/* q0 := x = A2_B_D + A2_B(mcf) */
-	vtbl.8	d0, {d30-d31}, d24
-	vtbl.8	d1, {d30-d31}, d25
+	vtbl.8	d0, {q15}, d24
+	vtbl.8	d1, {q15}, d25
 	veor	q0, q0, q14
 
 2:	/*
@@ -324,19 +324,19 @@ ENTRY(aes_neon_enc1)
 	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */
 
 	/* q0 := a/k */
-	vtbl.8	d0, {d22-d23}, d4
-	vtbl.8	d1, {d22-d23}, d5
+	vtbl.8	d0, {q11}, d4
+	vtbl.8	d1, {q11}, d5
 
 	/* q2 := j = i + k */
 	veor	q2, q3, q2
 
 	/* q12 := ir = 1/i */
-	vtbl.8	d24, {d20-d21}, d6
-	vtbl.8	d25, {d20-d21}, d7
+	vtbl.8	d24, {q10}, d6
+	vtbl.8	d25, {q10}, d7
 
 	/* q13 := jr = 1/j */
-	vtbl.8	d26, {d20-d21}, d4
-	vtbl.8	d27, {d20-d21}, d5
+	vtbl.8	d26, {q10}, d4
+	vtbl.8	d27, {q10}, d5
 
 	/* q12 := iak = 1/i + a/k */
 	veor	q12, q12, q0
@@ -345,12 +345,12 @@ ENTRY(aes_neon_enc1)
 	veor	q13, q13, q0
 
 	/* q12 := iakr = 1/(1/i + a/k) */
-	vtbl.8	d24, {d20-d21}, d24
-	vtbl.8	d25, {d20-d21}, d25
+	vtbl.8	d24, {q10}, d24
+	vtbl.8	d25, {q10}, d25
 
 	/* q13 := jakr = 1/(1/j + a/k) */
-	vtbl.8	d26, {d20-d21}, d26
-	vtbl.8	d27, {d20-d21}, d27
+	vtbl.8	d26, {q10}, d26
+	vtbl.8	d27, {q10}, d27
 
 	/* q2 := io = j + 1/(1/i + a/k) */
 	veor	q2, q2, q12
@@ -374,18 +374,18 @@ ENTRY(aes_neon_enc1)
 	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
-	vtbl.8	d4, {d12-d13}, d4
-	vtbl.8	d5, {d12-d13}, d5
-	vtbl.8	d6, {d14-d15}, d6
-	vtbl.8	d7, {d14-d15}, d7
+	vtbl.8	d4, {q6}, d4
+	vtbl.8	d5, {q6}, d5
+	vtbl.8	d6, {q7}, d6
+	vtbl.8	d7, {q7}, d7
 
 	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
 	veor	q2, q2, q14
 	veor	q2, q2, q3
 
 	/* q0 := x(sr[rmod4]) */
-	vtbl.8	d0, {d4-d5}, d30
-	vtbl.8	d1, {d4-d5}, d31
+	vtbl.8	d0, {q2}, d30
+	vtbl.8	d1, {q2}, d31
 
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
@@ -479,10 +479,10 @@ ENTRY(aes_neon_dec1)
 	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */
 
 	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
-	vtbl.8	d4, {d8-d9}, d4
-	vtbl.8	d5, {d8-d9}, d5
-	vtbl.8	d6, {d10-d11}, d6
-	vtbl.8	d7, {d10-d11}, d7
+	vtbl.8	d4, {q4}, d4
+	vtbl.8	d5, {q4}, d5
+	vtbl.8	d6, {q5}, d6
+	vtbl.8	d7, {q5}, d7
 
 	/* load dsb9 */
 	add	r4, r12, #(dsb9 - .Lconstants)
@@ -502,22 +502,22 @@ ENTRY(aes_neon_dec1)
 	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) 

CVS commit: src/sys/crypto/aes/arch/arm

2020-09-10 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Thu Sep 10 11:29:02 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
aes neon: Issue 256-bit loads rather than pairs of 128-bit loads.

Not sure why I didn't realize you could do this before!

Saves some temporary registers that can now be allocated to shave off
a few cycles.


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6	Sun Aug 16 18:02:03 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Thu Sep 10 11:29:02 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $")
 
 	.fpu	neon
 
@@ -38,9 +38,10 @@ RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020
 	.long	.Lconstants - .
 
 	.section .rodata
-	.p2align 4
+	.p2align 5
 .Lconstants:
 
+.Linv_inva:	/* inv and inva must be consecutive */
 	.type	inv,_ASM_TYPE_OBJECT
 inv:
 	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
@@ -99,125 +100,85 @@ sr:
 	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
 END(sr)
 
-	.type	iptlo,_ASM_TYPE_OBJECT
-iptlo:
-	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
+	.type	ipt,_ASM_TYPE_OBJECT
+ipt:
+	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2	/* lo */
 	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
-END(iptlo)
-
-	.type	ipthi,_ASM_TYPE_OBJECT
-ipthi:
-	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
+	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */
 	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
-END(ipthi)
+END(ipt)
 
-	.type	sb1_0,_ASM_TYPE_OBJECT
-sb1_0:
-	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
+	.type	sb1,_ASM_TYPE_OBJECT
+sb1:
+	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */
 	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
-END(sb1_0)
-
-	.type	sb1_1,_ASM_TYPE_OBJECT
-sb1_1:
-	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
+	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */
 	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
-END(sb1_1)
+END(sb1)
 
-	.type	sb2_0,_ASM_TYPE_OBJECT
-sb2_0:
-	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
+	.type	sb2,_ASM_TYPE_OBJECT
+sb2:
+	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */
 	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
-END(sb2_0)
-
-	.type	sb2_1,_ASM_TYPE_OBJECT
-sb2_1:
-	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
+	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */
 	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
-END(sb2_1)
+END(sb2)
 
-	.type	sbo_0,_ASM_TYPE_OBJECT
-sbo_0:
-	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
+	.type	sbo,_ASM_TYPE_OBJECT
+sbo:
+	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */
 	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
-END(sbo_0)
-
-	.type	sbo_1,_ASM_TYPE_OBJECT
-sbo_1:
-	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
+	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */
 	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
-END(sbo_1)
+END(sbo)
 
-	.type	diptlo,_ASM_TYPE_OBJECT
-diptlo:
-	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
+	.type	dipt,_ASM_TYPE_OBJECT
+dipt:
+	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F	/* lo */
 	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
-END(diptlo)
-
-	.type	dipthi,_ASM_TYPE_OBJECT
-dipthi:
-	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
+	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86	/* hi */
 	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
-END(dipthi)
+END(dipt)
 
-	.type	dsb9_0,_ASM_TYPE_OBJECT
-dsb9_0:
-	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
+	.type	dsb9,_ASM_TYPE_OBJECT
+dsb9:
+	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85	/* 0 */
 	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
-END(dsb9_0)
-
-	.type	dsb9_1,_ASM_TYPE_OBJECT
-dsb9_1:
-	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
+	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0	/* 1 */
 	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
-END(dsb9_1)
+END(dsb9)
 
-	.type	dsbd_0,_ASM_TYPE_OBJECT
-dsbd_0:
-	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
+	.type	dsbd,_ASM_TYPE_OBJECT
+dsbd:
+	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D	/* 0 */
 	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
-END(dsbd_0)
-
-	.type	dsbd_1,_ASM_TYPE_OBJECT
-dsbd_1:
-	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
+	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C	/* 1 */
 	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
-END(dsbd_1)
+END(dsbd)
 
-	.type	dsbb_0,_ASM_TYPE_OBJECT
-dsbb_0:
-	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0

CVS commit: src/sys/crypto/aes/arch/arm

2020-09-08 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Tue Sep  8 23:58:09 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
aesarmv8: Reallocate registers to shave off unnecessary MOV.


To generate a diff of this commit:
cvs rdiff -u -r1.14 -r1.15 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.14 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.15
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.14	Tue Sep  8 23:57:43 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Tue Sep  8 23:58:09 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $")
+RCSID("$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $")
 
 	.arch_extension	aes
 
@@ -917,13 +917,12 @@ END(aesarmv8_cbcmac_update1)
 ENTRY(aesarmv8_ccm_enc1)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ld1	{v0.16b, v1.16b}, [x4]	/* q0 := auth, q2 := ctr (be) */
-	mov	v2.16b, v1.16b
+	ld1	{v0.16b-v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
 	adrl	x11, ctr32_inc		/* x11 := _inc */
 	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
+	rev32	v2.16b, v1.16b		/* q2 := ctr (host-endian) */
 	_ALIGN_TEXT
 1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
@@ -937,9 +936,8 @@ ENTRY(aesarmv8_ccm_enc1)
 	subs	x10, x10, #0x10		/* count down bytes */
 	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
 	b.ne	1b			/* repeat if more blocks */
-	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */
-	mov	v1.16b, v2.16b		/* store updated auth/ctr */
-	st1	{v0.16b-v1.16b}, [x4]
+	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
+	st1	{v0.16b-v1.16b}, [x4]	/* store updated auth/ctr */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_ccm_enc1)



CVS commit: src/sys/crypto/aes/arch/arm

2020-09-08 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Tue Sep  8 23:57:43 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
aesarmv8: Issue two 4-register ld/st, not four 2-register ld/st.


To generate a diff of this commit:
cvs rdiff -u -r1.13 -r1.14 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.13 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.14
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.13	Tue Sep  8 23:57:13 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Tue Sep  8 23:57:43 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $")
+RCSID("$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $")
 
 	.arch_extension	aes
 
@@ -693,10 +693,8 @@ ENTRY(aesarmv8_xts_enc8)
 	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	/* q31 := tweak[7] */
-	ld1	{v0.16b,v1.16b}, [x1], #0x20	/* q[i] := ptxt[i] */
-	ld1	{v2.16b,v3.16b}, [x1], #0x20
-	ld1	{v4.16b,v5.16b}, [x1], #0x20
-	ld1	{v6.16b,v7.16b}, [x1], #0x20
+	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
+	ld1	{v4.16b-v7.16b}, [x1], #0x40
 	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
 	eor	v1.16b, v1.16b, v25.16b
 	eor	v2.16b, v2.16b, v26.16b
@@ -716,10 +714,8 @@ ENTRY(aesarmv8_xts_enc8)
 	eor	v5.16b, v5.16b, v29.16b
 	eor	v6.16b, v6.16b, v30.16b
 	eor	v7.16b, v7.16b, v31.16b
-	st1	{v0.16b,v1.16b}, [x2], #0x20	/* store ciphertext blocks */
-	st1	{v2.16b,v3.16b}, [x2], #0x20
-	st1	{v4.16b,v5.16b}, [x2], #0x20
-	st1	{v6.16b,v7.16b}, [x2], #0x20
+	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
+	st1	{v4.16b-v7.16b}, [x2], #0x40
 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x80		/* count down nbytes */
 	b.ne	1b			/* repeat if more block groups */



CVS commit: src/sys/crypto/aes/arch/arm

2020-09-08 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Tue Sep  8 23:57:13 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
aesarmv8: Adapt aes_armv8_64.S to big-endian.

Patch mainly from (and tested by) jakllsch@ with minor tweaks by me.
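
The recurring ldr q -> ld1 {v.16b} substitution below is the crux: ld1
with a .16b arrangement loads bytes in memory order on either
endianness, whereas ldr q loads one 128-bit quantity whose lane order
then differs in big-endian mode.  The same contract in intrinsics form
(a sketch assuming <arm_neon.h>, not the committed code):

    #include <arm_neon.h>

    /* Endian-neutral byte load: lane i of the result is always p[i].
     * On big-endian targets this must lower to ld1 {v.16b}, not ldr q. */
    static inline uint8x16_t
    load_block(const uint8_t p[16])
    {
        return vld1q_u8(p);
    }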


To generate a diff of this commit:
cvs rdiff -u -r1.12 -r1.13 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.12 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.13
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.12	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Tue Sep  8 23:57:13 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $")
+RCSID("$NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $")
 
 	.arch_extension	aes
 
@@ -114,11 +114,11 @@ END(unshiftrows_rotword_3)
  *	Standard ABI calling convention.
  */
 ENTRY(aesarmv8_setenckey128)
-	ldr	q1, [x1]	/* q1 := master key */
+	ld1	{v1.16b}, [x1]	/* q1 := master key */
 
 	adrl	x4, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */
+	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 table */
 
 	str	q1, [x0], #0x10	/* store master key as first round key */
 	mov	x2, #10		/* round count */
@@ -171,14 +171,14 @@ END(aesarmv8_setenckey128)
  *	Standard ABI calling convention.
  */
 ENTRY(aesarmv8_setenckey192)
-	ldr	q1, [x1], #0x10	/* q1 := master key[0:128) */
-	ldr	d2, [x1]	/* d2 := master key[128:192) */
+	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
+	ld1	{v2.8b}, [x1]	/* d2 := master key[128:192) */
 
 	adrl	x4, unshiftrows_rotword_1
 	adrl	x5, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
-	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */
+	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_1 */
+	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_rotword_3 */
 
 	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
 	mov	x2, #12		/* round count */
@@ -351,13 +351,13 @@ END(aesarmv8_setenckey192)
  */
 ENTRY(aesarmv8_setenckey256)
 	/* q1 := key[0:128), q2 := key[128:256) */
-	ldp	q1, q2, [x1], #0x20
+	ld1	{v1.16b-v2.16b}, [x1], #0x20
 
 	adrl	x4, unshiftrows_rotword_3
 	adrl	x5, unshiftrows_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
-	ldr	q17, [x5]	/* q17 := unshiftrows_3 */
+	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 */
+	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_3 */
 
 	/* store master key as first two round keys */
 	stp	q1, q2, [x0], #0x20
@@ -461,9 +461,9 @@ END(aesarmv8_enctodec)
 ENTRY(aesarmv8_enc)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := ptxt */
+	ld1	{v0.16b}, [x1]	/* q0 := ptxt */
 	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
-	str	q0, [x2]	/* store ctxt */
+	st1	{v0.16b}, [x2]	/* store ctxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_enc)
@@ -479,9 +479,9 @@ END(aesarmv8_enc)
 ENTRY(aesarmv8_dec)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := ctxt */
+	ld1	{v0.16b}, [x1]	/* q0 := ctxt */
 	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
-	str	q0, [x2]	/* store ptxt */
+	st1	{v0.16b}, [x2]	/* store ptxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_dec)
@@ -503,17 +503,17 @@ ENTRY(aesarmv8_cbc_enc)
 	mov	fp, sp
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-	ldr	q0, [x4]		/* q0 := chaining value */
+	ld1	{v0.16b}, [x4]		/* q0 := chaining value */
 	_ALIGN_TEXT
-1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
+1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
 	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
 	subs	x10, x10, #0x10		/* count down nbytes */
-	str	q0, [x2], #0x10		/* store ciphertext block */
+	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
 	b.ne	1b			/* repeat if x10 is nonzero */
-	str	q0, [x4]		/* store chaining value */
+	st1	{v0.16b}, [x4]		/* store chaining value */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 2:	ret
 END(aesarmv8_cbc_enc)
@@ -533,18 +533,21 @@ END(aesarmv8_cbc_enc)
 ENTRY(aesarmv8_cbc_dec1)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q24, [x4]		/* q24 := iv */
+	ld1	{v24.16b}, [x4]		/* q24 := iv */
 	mov	x9, x0			/* x9 := enckey */
 	

CVS commit: src/sys/crypto/aes/arch/arm

2020-08-16 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Sun Aug 16 18:02:03 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S files.aesneon

Log Message:
Fix AES NEON code for big-endian softfp ARM.

...which is how the kernel runs.  Switch to using __SOFTFP__ for
consistency with how it gets exposed to C, although I'm not sure how
to get it defined automagically in the toolchain for .S files so
that's set manually in files.aesneon for now.


To generate a diff of this commit:
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon_32.S
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/files.aesneon

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.5 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.5	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Sun Aug 16 18:02:03 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-RCSID("$NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $")
+RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")
 
 	.fpu	neon
 
@@ -228,15 +228,19 @@ END(dsbo_1)
  * aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
  * unsigned nrounds@r1)
  *
- *	With -mfloat-abi=soft(fp) (here spelled `#ifdef _KERNEL'):
+ *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
  *
  * uint8x16_t@(r0,r1,r2,r3)
  * aes_neon_enc1(const struct aesenc *enc@r0,
  * uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
  */
 ENTRY(aes_neon_enc1)
-#ifdef _KERNEL
+#ifdef __SOFTFP__
+#ifdef __ARM_BIG_ENDIAN
+	vmov	d0, r3, r2		/* d0 := x lo */
+#else
 	vmov	d0, r2, r3		/* d0 := x lo */
+#endif
 	vldr	d1, [sp]		/* d1 := x hi */
 	ldr	r1, [sp, #8]		/* r1 := nrounds */
 #endif
@@ -434,10 +438,15 @@ ENTRY(aes_neon_enc1)
 
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
-#ifdef _KERNEL
+#ifdef __SOFTFP__
+#ifdef __ARM_BIG_ENDIAN
+	vmov	r1, r0, d0
+	vmov	r3, r2, d1
+#else
 	vmov	r0, r1, d0
 	vmov	r2, r3, d1
 #endif
+#endif
 	bx	lr
 END(aes_neon_enc1)
 
@@ -457,8 +466,12 @@ END(aes_neon_enc1)
  * uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
  */
 ENTRY(aes_neon_dec1)
-#ifdef _KERNEL
+#ifdef __SOFTFP__
+#ifdef __ARM_BIG_ENDIAN
+	vmov	d0, r3, r2		/* d0 := x lo */
+#else
 	vmov	d0, r2, r3		/* d0 := x lo */
+#endif
 	vldr	d1, [sp]		/* d1 := x hi */
 	ldr	r1, [sp, #8]		/* r1 := nrounds */
 #endif
@@ -669,9 +682,14 @@ ENTRY(aes_neon_dec1)
 
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
-#ifdef _KERNEL
+#ifdef __SOFTFP__
+#ifdef __ARM_BIG_ENDIAN
+	vmov	r1, r0, d0
+	vmov	r3, r2, d1
+#else
 	vmov	r0, r1, d0
 	vmov	r2, r3, d1
 #endif
+#endif
 	bx	lr
 END(aes_neon_dec1)

Index: src/sys/crypto/aes/arch/arm/files.aesneon
diff -u src/sys/crypto/aes/arch/arm/files.aesneon:1.3 src/sys/crypto/aes/arch/arm/files.aesneon:1.4
--- src/sys/crypto/aes/arch/arm/files.aesneon:1.3	Tue Jun 30 17:03:13 2020
+++ src/sys/crypto/aes/arch/arm/files.aesneon	Sun Aug 16 18:02:03 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.aesneon,v 1.3 2020/06/30 17:03:13 riastradh Exp $
+#	$NetBSD: files.aesneon,v 1.4 2020/08/16 18:02:03 riastradh Exp $
 
 ifdef aarch64
 makeoptions	aes	"COPTS.aes_neon.c"+="-march=armv8-a"
@@ -8,6 +8,8 @@ makeoptions	aes	"COPTS.aes_neon.c"+="-mf
 makeoptions	aes	"COPTS.aes_neon_subr.c"+="-mfloat-abi=softfp -mfpu=neon"
 endif
 
+makeoptions	aes	"AOPTS.aes_neon_32.S"+="-D__SOFTFP__"
+
 file	crypto/aes/arch/arm/aes_neon.c		aes & (cpu_cortex | aarch64)
 file	crypto/aes/arch/arm/aes_neon_impl.c	aes & (cpu_cortex | aarch64)
 file	crypto/aes/arch/arm/aes_neon_subr.c	aes & (cpu_cortex | aarch64)



CVS commit: src/sys/crypto/aes/arch/arm

2020-08-08 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Sun Aug  9 02:00:57 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_subr.c

Log Message:
Nix outdated comment.

I implemented this parallelism a couple weeks ago.


To generate a diff of this commit:
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon_subr.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.5 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.6
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.5	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sun Aug  9 02:00:57 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_subr.c,v 1.6 2020/08/09 02:00:57 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.6 2020/08/09 02:00:57 riastradh Exp $");
 
 #ifdef _KERNEL
 #include 
@@ -287,12 +287,6 @@ aes_neon_cbcmac_update1(const struct aes
 	storeblock(auth0, auth);
 }
 
-/*
- * XXX On aarch64, we have enough registers that we should be able to
- * pipeline two simultaneous vpaes computations in an `aes_neon_enc2'
- * function, which should substantially improve CCM throughput.
- */
-
 void
 aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-28 Thread Taylor R Campbell
Module Name:    src
Committed By:   riastradh
Date:   Tue Jul 28 20:11:09 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon.c aes_neon_impl.h aes_neon_subr.c
arm_neon.h

Log Message:
Draft 2x vectorized neon vpaes for aarch64.

Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; same
improvement should probably carry over to rpi4 CPU which lacks
ARMv8.0-AES.
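
The speedup comes from giving the core two independent dependency chains
per iteration.  A hedged sketch of how a parallelizable mode such as CBC
decryption could drive the new two-block entry point; loadblock() and
storeblock() are the helpers in aes_neon_subr.c, but the driver itself
is illustrative, not the committed code:

    /* P[0] = D(C[0]) ^ cv and P[1] = D(C[1]) ^ C[0], two blocks/call. */
    static void
    cbc_dec2_sketch(const struct aesdec *dec, const uint8_t in[static 32],
        uint8_t out[static 32], uint8x16_t cv, unsigned nrounds)
    {
        uint8x16x2_t b;

        b.val[0] = loadblock(in);
        b.val[1] = loadblock(in + 16);
        b = aes_neon_dec2(dec, b, nrounds);
        storeblock(out, b.val[0] ^ cv);         /* cv = IV or prior ctxt */
        storeblock(out + 16, b.val[1] ^ loadblock(in));
    }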


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c \
src/sys/crypto/aes/arch/arm/aes_neon_subr.c
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.3 src/sys/crypto/aes/arch/arm/aes_neon.c:1.4
--- src/sys/crypto/aes/arch/arm/aes_neon.c:1.3	Tue Jun 30 20:32:11 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.c	Tue Jul 28 20:11:09 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
 
 #include 
 
@@ -589,6 +589,59 @@ aes_neon_enc1(const struct aesenc *enc, 
 	return vqtbl1q_u8(x, sr[rmod4]);
 }
 
+uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
+{
+	const uint32_t *rk32 = enc->aese_aes.aes_rk;
+	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
+	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
+	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
+	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
+	uint8x16_t x0 = x.val[0], x1 = x.val[1];
+	uint8x16_t io0, jo0, io1, jo1;
+	unsigned rmod4 = 0;
+
+	x0 = aes_schedule_transform(x0, ipt);
+	x1 = aes_schedule_transform(x1, ipt);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	for (;;) {
+		uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
+		uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
+
+		subbytes(&io0, &jo0, x0, inv_, inva_);
+		subbytes(&io1, &jo1, x1, inv_, inva_);
+
+		rk32 += 4;
+		rmod4 = (rmod4 + 1) % 4;
+		if (--nrounds == 0)
+			break;
+
+		A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
+		A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
+		A_0 ^= loadroundkey(rk32);
+		A_1 ^= loadroundkey(rk32);
+		A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
+		A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
+		A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
+		A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
+		A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
+		A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
+		x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
+		x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
+	}
+	x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
+	x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	return (uint8x16x2_t) { .val = {
+		[0] = vqtbl1q_u8(x0, sr[rmod4]),
+		[1] = vqtbl1q_u8(x1, sr[rmod4]),
+	} };
+}
+
 uint8x16_t
 aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
 {
@@ -628,4 +681,60 @@ aes_neon_dec1(const struct aesdec *dec, 
 	return vqtbl1q_u8(x, sr[i]);
 }
 
+uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
+{
+	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
+	unsigned i = 3 & ~(nrounds - 1);
+	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+	uint8x16_t x0 = x.val[0], x1 = x.val[1];
+	uint8x16_t io0, jo0, io1, jo1, mc;
+
+	x0 = aes_schedule_transform(x0, dipt);
+	x1 = aes_schedule_transform(x1, dipt);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	rk32 += 4;
+
+	mc = mc_forward[3];
+	for (;;) {
+		subbytes(&io0, &jo0, x0, inv_, inva_);
+		subbytes(&io1, &jo1, x1, inv_, inva_);
+		if (--nrounds == 0)
+			break;
+
+		x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
+		x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
+		x0 ^= loadroundkey(rk32);
+		x1 ^= loadroundkey(rk32);
+		rk32 += 4;	/* next round key */
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbb[0], 

CVS commit: src/sys/crypto/aes/arch/arm

2020-07-27 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Mon Jul 27 20:54:12 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Issue aese/aesmc and aesd/aesimc in pairs.

Advised by the aarch64 optimization guide; increases cgd throughput
by about 10%.


To generate a diff of this commit:
cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.10
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9	Mon Jul 27 20:53:22 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Mon Jul 27 20:54:11 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.10 2020/07/27 20:54:11 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -1041,15 +1041,18 @@ END(ctr32_inc)
 	.type	aesarmv8_enc1,@function
 aesarmv8_enc1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := MixColumns(q0) */
+1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0 */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
 	aese	v0.16b, v16.16b
-	ldr	q16, [x0], #0x10		/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
+	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_enc1)
@@ -1067,17 +1070,21 @@ END(aesarmv8_enc1)
 	.type	aesarmv8_enc2,@function
 aesarmv8_enc2:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i] */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
-	ldr	q16, [x0], #0x10		/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
 	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	ret
@@ -1097,18 +1104,28 @@ END(aesarmv8_enc2)
 	.type	aesarmv8_enc8,@function
 aesarmv8_enc8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i] */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
+	aese	v2.16b, v16.16b
 	aesmc	v2.16b, v2.16b
+	aese	v3.16b, v16.16b
 	aesmc	v3.16b, v3.16b
+	aese	v4.16b, v16.16b
 	aesmc	v4.16b, v4.16b
+	aese	v5.16b, v16.16b
 	aesmc	v5.16b, v5.16b
+	aese	v6.16b, v16.16b
 	aesmc	v6.16b, v6.16b
+	aese	v7.16b, v16.16b
 	aesmc	v7.16b, v7.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
@@ -1118,9 +1135,9 @@ aesarmv8_enc8:
 	aese	v5.16b, v16.16b
 	aese	v6.16b, v16.16b
 	aese	v7.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
-	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
+	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	eor	v2.16b, v2.16b, v16.16b
 	eor	v3.16b, v3.16b, v16.16b
@@ -1144,15 +1161,19 @@ END(aesarmv8_enc8)
 	.type	aesarmv8_dec1,@function
 aesarmv8_dec1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := InMixColumns(q0) */
-	aesimc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
-	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
 	aesd	v0.16b, v16.16b
+	/* q0 := InMixColumns(q0) */
+	aesimc	v0.16b, v0.16b
 	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
 	b.ne	1b
+	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+	aesd	v0.16b, v16.16b
+	ldr	q16, [x0]		/* load last round key */
+	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_dec1)
@@ -1171,18 +1192,29 @@ END(aesarmv8_dec1)
 	.type	aesarmv8_dec8,@function
 aesarmv8_dec8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := InMixColumns(q[i]) */
+1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
+	aesd	v0.16b, v16.16b
+	/* q[i] := InMixColumns(q[i]) */
 	aesimc	v0.16b, v0.16b
+	aesd	v1.16b, v16.16b
 	aesimc	v1.16b, v1.16b
+	aesd	v2.16b, v16.16b
 	aesimc	v2.16b, v2.16b
+	aesd	v3.16b, v16.16b
 	aesimc	v3.16b, v3.16b

CVS commit: src/sys/crypto/aes/arch/arm

2020-07-27 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Mon Jul 27 20:52:11 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
PIC for aes_neon_32.S.

Without this, tests/sys/crypto/aes/t_aes fails to start on armv7
because of R_ARM_ABS32 relocations in a nonwritable text segment for
a PIE -- which atf quietly ignores in the final report!  Yikes.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.1 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.2
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.1	Mon Jun 29 23:57:56 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Mon Jul 27 20:52:10 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.1 2020/06/29 23:57:56 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.2 2020/07/27 20:52:10 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -30,8 +30,14 @@
 
 	.fpu	neon
 
+	.text
+	.p2align 2
+.Lconstants_addr:
+	.long	.Lconstants - .
+
 	.section .rodata
 	.p2align 4
+.Lconstants:
 
 	.type	inv,_ASM_TYPE_OBJECT
 inv:
@@ -239,7 +245,7 @@ ENTRY(aes_neon_enc1)
 	 * r3: rmod4
 	 * r4: mc_forward
 	 * r5: mc_backward
-	 * r6,r7,r8,r10,r11: temporaries
+	 * r6,r7,r8,r10,r11,r12: temporaries
 	 * q0={d0-d1}: x/ak/A
 	 * q1={d2-d3}: 0x0f0f...
 	 * q2={d4-d5}: lo/k/j/io
@@ -258,23 +264,30 @@ ENTRY(aes_neon_enc1)
 	 * q15={d30-d31}: A2_B/sr[rmod4]
 	 */
 
+	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
+	ldr	r12, .Lconstants_addr
+	adr	r11, .Lconstants_addr
+
 	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	movw	r3, #0
 	vmov.i8	q1, #0x0f
 
+	/* r12 := .Lconstants */
+	add	r12, r12, r11
+
 	/* (q4, q5) := (iptlo, ipthi) */
-	ldr	r6, =iptlo
-	ldr	r7, =ipthi
+	add	r6, r12, #(iptlo - .Lconstants)
+	add	r7, r12, #(ipthi - .Lconstants)
 	vld1.64	{d8-d9}, [r6 :128]
 	vld1.64	{d10-d11}, [r7 :128]
 
 	/* load the rest of the constants */
-	ldr	r4, =sb1_0
-	ldr	r5, =sb1_1
-	ldr	r6, =sb2_0
-	ldr	r7, =sb2_1
-	ldr	r8, =inv
-	ldr	r10, =inva
+	add	r4, r12, #(sb1_0 - .Lconstants)
+	add	r5, r12, #(sb1_1 - .Lconstants)
+	add	r6, r12, #(sb2_0 - .Lconstants)
+	add	r7, r12, #(sb2_1 - .Lconstants)
+	add	r8, r12, #(inv - .Lconstants)
+	add	r10, r12, #(inva - .Lconstants)
 	vld1.64	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
 	vld1.64	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
 	vld1.64	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
@@ -283,8 +296,8 @@ ENTRY(aes_neon_enc1)
 	vld1.64	{d22-d23}, [r10 :128]	/* q11 = inva */
 
 	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
-	ldr	r4, =mc_forward
-	ldr	r5, =mc_backward
+	add	r4, r12, #(mc_forward - .Lconstants)
+	add	r5, r12, #(mc_backward - .Lconstants)
 
 	/* (q2, q3) := (lo, hi) */
 	vshr.u8	q3, q0, #4
@@ -392,9 +405,9 @@ ENTRY(aes_neon_enc1)
 	bne	1b
 
 	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
-	ldr	r8, =sr
-	ldr	r6, =sbo_0
-	ldr	r7, =sbo_1
+	add	r8, r12, #(sr - .Lconstants)
+	add	r6, r12, #(sbo_0 - .Lconstants)
+	add	r7, r12, #(sbo_1 - .Lconstants)
 	add	r8, r8, r3, lsl #4
 	vld1.64	{d12-d13}, [r6 :128]
 	vld1.64	{d14-d15}, [r7 :128]
@@ -469,23 +482,30 @@ ENTRY(aes_neon_dec1)
 	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
 	 */
 
+	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
+	ldr	r12, .Lconstants_addr
+	adr	r11, .Lconstants_addr
+
 	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 	rsb	r3, r1, #0		/* r3 := ~(x - 1) = -x */
 	vmov.i8	q1, #0x0f
 	and	r3, r3, #3		/* r3 := 3 & ~(x - 1) */
 
+	/* r12 := .Lconstants */
+	add	r12, r12, r11
+
 	/* (q4, q5) := (diptlo, dipthi) */
-	ldr	r6, =diptlo
-	ldr	r7, =dipthi
+	add	r6, r12, #(diptlo - .Lconstants)
+	add	r7, r12, #(dipthi - .Lconstants)
 	vld1.64	{d8-d9}, [r6 :128]
 	vld1.64	{d10-d11}, [r7 :128]
 
 	/* load the rest of the constants */
-	ldr	r4, =dsbb_0
-	ldr	r5, =dsbb_1
-	ldr	r6, =inv
-	ldr	r7, =inva
-	ldr	r8, =.Lmc_forward_3
+	add	r4, r12, #(dsbb_0 - .Lconstants)
+	add	r5, r12, #(dsbb_1 - .Lconstants)
+	add	r6, r12, #(inv - .Lconstants)
+	add	r7, r12, #(inva - .Lconstants)
+	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
 	vld1.64	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
 	vld1.64	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
 	vld1.64	{d20-d21}, [r6 :128]	/* q10 := inv */
@@ -504,8 +524,8 @@ ENTRY(aes_neon_dec1)
 	vtbl.8	d7, {d10-d11}, d7
 
 	/* load dsb9 */
-	ldr	r4, =dsb9_0
-	ldr	r5, =dsb9_1
+	add	r4, r12, #(dsb9_0 - .Lconstants)
+	add	r5, r12, #(dsb9_1 - .Lconstants)
 	vld1.64	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
 	vld1.64	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */
 
@@ -516,7 +536,7 @@ ENTRY(aes_neon_dec1)
 	b	2f
 
 1:	/* load dsbd */
-	ldr	r4, =dsbd_0
+	add	r4, r12, #(dsbd_0 - .Lconstants)
 	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
 	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */
 
@@ -543,7 +563,7 



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-25 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Sat Jul 25 22:42:31 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: arm_neon.h

Log Message:
Fix missing clang big-endian case.
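
In outline, the pattern the fix applies (a sketch mirroring the diff
below, with a hypothetical name, not additional committed code): on
big-endian targets clang's raw vector load yields the lanes in the
reverse of the little-endian lane numbering this header promises, so
the wrapper swaps them back:

	static inline uint8x16_t
	load_le_lanes(const uint8_t *p)		/* hypothetical */
	{
		uint8x16_t v = (uint8x16_t)__builtin_neon_vld1q_v(p, 48);
	#ifndef __LITTLE_ENDIAN__
		/* reverse lanes so lane 0 is the first byte in memory */
		v = __builtin_shufflevector(v, v,
		    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
	#endif
		return v;
	}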


To generate a diff of this commit:
cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.4 src/sys/crypto/aes/arch/arm/arm_neon.h:1.5
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.4	Sat Jul 25 22:36:06 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Sat Jul 25 22:42:31 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.4 2020/07/25 22:36:06 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.5 2020/07/25 22:42:31 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -237,7 +237,12 @@ vld1q_u8(const uint8_t *__p8)
 	return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
 #endif
 #elif defined(__clang__)
-	return (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
+	uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v,
+	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+#endif
+	return __v;
 #endif
 }
 
@@ -442,7 +447,7 @@ vst1q_u8(uint8_t *__p8, uint8x16_t __v)
 #elif defined(__clang__)
 #ifndef __LITTLE_ENDIAN__
 	__v = __builtin_shufflevector(__v, __v,
-	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 #endif
 	__builtin_neon_vst1q_v(__p8, __v, 48);
 #endif



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-25 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Sat Jul 25 22:43:01 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: arm_neon.h

Log Message:
Add 32-bit load, store, and shift intrinsics.

vld1q_u32
vst1q_u32
vshlq_n_u32
vshrq_n_u32
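
As a quick usage sketch (rotl12 and demo are hypothetical, not part of
the commit): the paired shifts compose into the 32-bit lane rotations
that ChaCha-style code wants:

	/* rotate each 32-bit lane left by 12 bits */
	static inline uint32x4_t
	rotl12(uint32x4_t x)
	{
		/* shift counts must be compile-time constants */
		return vshlq_n_u32(x, 12) | vshrq_n_u32(x, 20);
	}

	static void
	demo(const uint32_t in[4], uint32_t out[4])
	{
		uint32x4_t v = vld1q_u32(in);	/* 4x32-bit lane load */
		vst1q_u32(out, rotl12(v));	/* 4x32-bit lane store */
	}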


To generate a diff of this commit:
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.5 src/sys/crypto/aes/arch/arm/arm_neon.h:1.6
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.5	Sat Jul 25 22:42:31 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Sat Jul 25 22:43:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.5 2020/07/25 22:42:31 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -222,6 +222,30 @@ vgetq_lane_u32(uint32x4_t __v, uint8_t _
 #endif
 
 _INTRINSATTR
+static __inline uint32x4_t
+vld1q_u32(const uint32_t *__p32)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	const __builtin_aarch64_simd_si *__p =
+	(const __builtin_aarch64_simd_si *)__p32;
+
+	return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
+#else
+	const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;
+
+	return (uint32x4_t)__builtin_neon_vld1v4si(__p);
+#endif
+#elif defined(__clang__)
+	uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
+#endif
+	return __v;
+#endif
+}
+
+_INTRINSATTR
 static __inline uint8x16_t
 vld1q_u8(const uint8_t *__p8)
 {
@@ -383,6 +407,38 @@ vsetq_lane_u64(uint64_t __x, uint64x2_t 
 
 #if defined(__GNUC__) && !defined(__clang__)
 _INTRINSATTR
+static __inline uint32x4_t
+vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
+#endif
+}
+#elif defined(__clang__)
+#define	vshlq_n_u32(__v, __bits)	  \
+	(uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
+#endif
+}
+#elif defined(__clang__)
+#define	vshrq_n_u8(__v, __bits)		  \
+	(uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
 static __inline uint8x16_t
 vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
 {
@@ -432,6 +488,28 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t 
 
 _INTRINSATTR
 static __inline void
+vst1q_u32(uint32_t *__p32, uint32x4_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+#ifdef __aarch64__
+	__builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
+
+	__builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
+#else
+	__builtin_neon_si *__p = (__builtin_neon_si *)__p32;
+
+	__builtin_neon_vst1v4si(__p, (int32x4_t)__v);
+#endif
+#elif defined(__clang__)
+#ifndef __LITTLE_ENDIAN__
+	__v = __builtin_shufflevector(__v, __v, 3,2,1,0);
+#endif
+	__builtin_neon_vst1q_v(__p32, __v, 50);
+#endif
+}
+
+_INTRINSATTR
+static __inline void
 vst1q_u8(uint8_t *__p8, uint8x16_t __v)
 {
 #if defined(__GNUC__) && !defined(__clang__)



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-25 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Sat Jul 25 22:36:06 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon.h aes_neon_impl.c aes_neon_subr.c
arm_neon.h

Log Message:
Implement AES-CCM with NEON.
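
In outline (a hedged usage sketch built only from the signatures in
the diff below; ccm_encrypt_sketch itself is hypothetical): the
32-byte authctr state carries the running CBC-MAC in its first half
and the big-endian CTR block in its second half, and each call
advances both:

	static void
	ccm_encrypt_sketch(const struct aesenc *enc, uint32_t nrounds,
	    const uint8_t *aad, size_t aadlen,	/* nonzero multiple of 16 */
	    const uint8_t *ptxt, uint8_t *ctxt, size_t len,	/* ditto */
	    uint8_t authctr[32])
	{
		/* absorb the associated data into the CBC-MAC half */
		aes_neon_cbcmac_update1(enc, aad, aadlen, authctr, nrounds);

		/* CTR-encrypt while extending the CBC-MAC over the payload */
		aes_neon_ccm_enc1(enc, ptxt, ctxt, len, authctr, nrounds);
	}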


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_neon.h \
src/sys/crypto/aes/arch/arm/aes_neon_subr.c
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_impl.c \
src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon.h
diff -u src/sys/crypto/aes/arch/arm/aes_neon.h:1.2 src/sys/crypto/aes/arch/arm/aes_neon.h:1.3
--- src/sys/crypto/aes/arch/arm/aes_neon.h:1.2	Sat Jul 25 22:12:57 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.h	Sat Jul 25 22:36:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon.h,v 1.2 2020/07/25 22:12:57 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.h,v 1.3 2020/07/25 22:36:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -59,6 +59,12 @@ void aes_neon_xts_enc(const struct aesen
 uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
 void aes_neon_xts_dec(const struct aesdec *, const uint8_t[static 16],
 uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aes_neon_cbcmac_update1(const struct aesenc *, const uint8_t[static 16],
+size_t, uint8_t[static 16], uint32_t);
+void aes_neon_ccm_enc1(const struct aesenc *, const uint8_t[static 16],
+uint8_t[static 16], size_t, uint8_t[static 32], uint32_t);
+void aes_neon_ccm_dec1(const struct aesenc *, const uint8_t[static 16],
+uint8_t[static 16], size_t, uint8_t[static 32], uint32_t);
 
 int aes_neon_selftest(void);
 
Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.2 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.3
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.2	Tue Jun 30 20:32:11 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c	Sat Jul 25 22:36:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,9 @@
  */
 
 #include 
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $");
+
+#include 
 
 #ifdef _KERNEL
 #include 
@@ -213,6 +215,89 @@ aes_neon_xts_dec(const struct aesdec *de
 	storeblock(tweak, t);
 }
 
+void
+aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
+size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
+{
+	uint8x16_t auth;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	auth = loadblock(auth0);
+	for (; nbytes; nbytes -= 16, in += 16)
+		auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds);
+	storeblock(auth0, auth);
+}
+
+/*
+ * XXX On aarch64, we have enough registers that we should be able to
+ * pipeline two simultaneous vpaes computations in an `aes_neon_enc2'
+ * function, which should substantially improve CCM throughput.
+ */
+
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+#define	vbetoh32q_u8	vrev32q_u8
+#define	vhtobe32q_u8	vrev32q_u8
+#elif _BYTE_ORDER == _BIG_ENDIAN
+#define	vbetoh32q_u8(x)	(x)
+#define	vhtobe32q_u8(x)	(x)
+#else
+#error what kind of endian are you anyway
+#endif
+
+void
+aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
+uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
+uint32_t nrounds)
+{
+	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
+	uint8x16_t auth, ptxt, ctr_be;
+	uint32x4_t ctr;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	auth = loadblock(authctr);
+	ctr_be = loadblock(authctr + 16);
+	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		ptxt = loadblock(in);
+		auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds);
+		ctr = vaddq_u32(ctr, ctr32_inc);
+		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds));
+	}
+	storeblock(authctr, auth);
+	storeblock(authctr + 16, ctr_be);
+}
+
+void
+aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
+uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
+uint32_t nrounds)
+{
+	const uint32x4_t ctr32_inc = {0, 0, 0, 1};
+	uint8x16_t auth, ctr_be, ptxt;
+	uint32x4_t ctr;
+
+	KASSERT(nbytes);
+	KASSERT(nbytes % 16 == 0);
+
+	auth = loadblock(authctr);
+	ctr_be = loadblock(authctr + 16);
+	ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
+	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+		ctr = vaddq_u32(ctr, ctr32_inc);
+		ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
+		ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds);



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-25 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Sat Jul 25 22:32:09 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Invert some loops to save a branch instruction on every iteration.
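
Schematically, in C (hypothetical helpers; the real change is in the
assembly below): the inverted loop jumps into the middle on entry, so
the conditional branch at the bottom is the only branch per iteration,
where the straightforward form needs an exit test plus an
unconditional branch back to the top:

	uint8x16_t dec1(uint8x16_t), chain(uint8x16_t);	/* hypothetical */

	uint8x16_t
	rounds(uint8x16_t x, unsigned n)	/* n >= 1 */
	{
		goto enter;
		do {
			x = chain(x);	/* work skipped on the first pass */
	enter:
			x = dec1(x);
		} while (--n != 0);
		return x;
	}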


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.6 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.7
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.6	Wed Jul 22 06:15:21 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Sat Jul 25 22:32:09 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.6 2020/07/22 06:15:21 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.7 2020/07/25 22:32:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -437,13 +437,13 @@ END(aesarmv8_setenckey256)
  */
 ENTRY(aesarmv8_enctodec)
 	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
-1:	str	q0, [x1], #0x10	/* store round key */
+	b	2f
+1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
+2:	str	q0, [x1], #0x10	/* store round key */
 	subs	x2, x2, #1	/* count down round */
 	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
-	b.eq	2f		/* stop if this is the last one */
-	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
-	b	1b
-2:	str	q0, [x1]	/* store first round key verbatim */
+	b.ne	1b		/* repeat if there's more */
+	str	q0, [x1]	/* store first round key verbatim */
 	ret
 END(aesarmv8_enctodec)
 
@@ -536,17 +536,17 @@ ENTRY(aesarmv8_cbc_dec1)
 	add	x2, x2, x3		/* x2 := pointer past end of out */
 	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
 	str	q0, [x4]		/* update iv */
-1:	mov	x0, x9			/* x0 := enckey */
-	mov	x3, x5			/* x3 := nrounds */
-	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
-	subs	x10, x10, #0x10		/* count down nbytes */
-	b.eq	2f			/* stop if this is the first block */
-	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
+	b	2f
+1:	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
 	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
 	str	q0, [x2, #-0x10]!	/* store plaintext block */
 	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
-	b	1b
-2:	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
+2:	mov	x0, x9			/* x0 := enckey */
+	mov	x3, x5			/* x3 := nrounds */
+	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
+	subs	x10, x10, #0x10		/* count down nbytes */
+	b.ne	1b			/* repeat if more blocks */
+	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
 	str	q0, [x2, #-0x10]!	/* store first plaintext block */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
@@ -573,7 +573,11 @@ ENTRY(aesarmv8_cbc_dec8)
 	add	x2, x2, x3		/* x2 := pointer past end of out */
 	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
 	str	q7, [x4]		/* update iv */
-1:	ldp	q4, q5, [x1, #-0x20]!
+	b	2f
+1:	ldp	q6, q7, [x1, #-0x20]!
+	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
+	stp	q0, q1, [x2, #-0x20]!
+2:	ldp	q4, q5, [x1, #-0x20]!
 	ldp	q2, q3, [x1, #-0x20]!
 	ldp	q0, q1, [x1, #-0x20]!
 	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-23 Thread Ryo Shimizu
Module Name:src
Committed By:   ryo
Date:   Thu Jul 23 11:33:01 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: arm_neon.h

Log Message:
fix build with llvm/clang.


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.2 src/sys/crypto/aes/arch/arm/arm_neon.h:1.3
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.2	Tue Jun 30 21:24:00 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Thu Jul 23 11:33:01 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.2 2020/06/30 21:24:00 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.3 2020/07/23 11:33:01 ryo Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@ typedef struct { uint8x8_t val[2]; } uin
 #elif defined(__clang__)
 
 #define	_INTRINSATTR			  \
-	__attribute__((__always_inline__, __nodebug))
+	__attribute__((__always_inline__, __nodebug__))
 
 typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
 typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-22 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Wed Jul 22 06:15:21 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Fix register name in comment.

Some time ago I reallocated the registers to avoid inadvertently
clobbering the callee-saves v9, but neglected to update the comment.


To generate a diff of this commit:
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.5 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.6
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.5	Sun Jul 19 07:32:43 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Wed Jul 22 06:15:21 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.5 2020/07/19 07:32:43 ryo Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.6 2020/07/22 06:15:21 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -827,7 +827,7 @@ aesarmv8_xts_mulx:
 	 * carried into x^128 = x^7 + x^2 + x + 1.
 	 */
 	adrl	x0, xtscarry
-	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */
+	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
 	ldr	q0, [x0]		/* q0 := xtscarry */
 	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
 	shl	v31.2d, v31.2d, #1	/* shift */
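
For reference, what this routine computes, as a C-level sketch
(xts_mulx_c is hypothetical; the assembly operates on the two halves
of q31 with the cmlt-generated mask): multiply the 128-bit tweak by x
in GF(2^128), folding the bit shifted out of the top back in via
x^128 = x^7 + x^2 + x + 1, i.e. the constant 0x87:

	static void
	xts_mulx_c(uint64_t t[2])	/* t[0] low half, t[1] high half */
	{
		/* 0x87 if the top bit is set, else 0 (a mask, like cmlt) */
		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;

		t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit shift */
		t[0] = (t[0] << 1) ^ carry;		/* reduce */
	}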



CVS commit: src/sys/crypto/aes/arch/arm

2020-07-19 Thread Ryo Shimizu
Module Name:src
Committed By:   ryo
Date:   Sun Jul 19 07:32:43 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
fix build with clang/llvm.

The clang aarch64 assembler doesn't accept the optional number of lanes
on a vector register (but the ARM ARM says that an assembler must
accept it).


To generate a diff of this commit:
cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.5
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4	Tue Jun 30 23:06:02 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Sun Jul 19 07:32:43 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.5 2020/07/19 07:32:43 ryo Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -238,8 +238,8 @@ ENTRY(aesarmv8_setenckey192)
 	 */
 
 	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
-	dup	v1.4s, v5.4s[3]
-	mov	v1.4s[0], v5.4s[2]
+	dup	v1.4s, v5.s[3]
+	mov	v1.s[0], v5.s[2]
 
 	/*
 	 * v6.4s := (0, 0, rklo[0], rklo[1])
@@ -257,7 +257,7 @@ ENTRY(aesarmv8_setenckey192)
 	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
 	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
 	 */
-	mov	v2.2d[1], v5.2d[0]
+	mov	v2.d[1], v5.d[0]
 
 	/* store two round keys */
 	stp	q2, q3, [x0], #0x20
@@ -325,7 +325,7 @@ ENTRY(aesarmv8_setenckey192)
 	ext	v5.16b, v0.16b, v4.16b, #12
 
 	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
-	dup	v2.4s, v1.4s[3]
+	dup	v2.4s, v1.s[3]
 
 	/*
 	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],



CVS commit: src/sys/crypto/aes/arch/arm

2020-06-30 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Tue Jun 30 23:06:02 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Reallocate registers to avoid abusing callee-saves registers, v8-v15.

Forgot to consult the AAPCS before committing this before -- oops!

While here, take advantage of the 32 aarch64 simd registers to avoid
all stack spills.


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3	Tue Jun 30 21:53:39 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Tue Jun 30 23:06:02 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -116,7 +116,7 @@ ENTRY(aesarmv8_setenckey128)
 
 	adrl	x4, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 table */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */
 
 	str	q1, [x0], #0x10	/* store master key as first round key */
 	mov	x2, #10		/* round count */
@@ -136,7 +136,7 @@ ENTRY(aesarmv8_setenckey128)
 
 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 
 	/*
@@ -175,8 +175,8 @@ ENTRY(aesarmv8_setenckey192)
 	adrl	x4, unshiftrows_rotword_1
 	adrl	x5, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_1 */
-	ldr	q9, [x5]	/* q9 := unshiftrows_rotword_3 */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
+	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */
 
 	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
 	mov	x2, #12		/* round count */
@@ -197,7 +197,7 @@ ENTRY(aesarmv8_setenckey192)
 
 	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 
 	/*
@@ -269,8 +269,8 @@ ENTRY(aesarmv8_setenckey192)
 	 *	q2 = rk
 	 *	q3 = nrk
 	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
-	 *	q8 = unshiftrows_rotword_1
-	 *	q9 = unshiftrows_rotword_3
+	 *	q16 = unshiftrows_rotword_1
+	 *	q17 = unshiftrows_rotword_3
 	 *
 	 * We have to compute, in q1:
 	 *
@@ -294,7 +294,7 @@ ENTRY(aesarmv8_setenckey192)
 
 	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v1.16b, {v1.16b}, v9.16b
+	tbl	v1.16b, {v1.16b}, v17.16b
 	eor	v1.16b, v1.16b, v4.16b
 
 	/*
@@ -354,8 +354,8 @@ ENTRY(aesarmv8_setenckey256)
 	adrl	x4, unshiftrows_rotword_3
 	adrl	x5, unshiftrows_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 */
-	ldr	q9, [x5]	/* q9 := unshiftrows_3 */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
+	ldr	q17, [x5]	/* q17 := unshiftrows_3 */
 
 	/* store master key as first two round keys */
 	stp	q1, q2, [x0], #0x20
@@ -376,7 +376,7 @@ ENTRY(aesarmv8_setenckey256)
 
 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 
 	/*
@@ -402,7 +402,7 @@ ENTRY(aesarmv8_setenckey256)
 	aese	v3.16b, v0.16b
 
 	/* v3.4s[i] := SubBytes(rk[3]) */
-	tbl	v3.16b, {v3.16b}, v9.16b
+	tbl	v3.16b, {v3.16b}, v17.16b
 
 	/*
 	 * v5.4s := (0,prk[0],prk[1],prk[2])
@@ -458,9 +458,9 @@ END(aesarmv8_enctodec)
 ENTRY(aesarmv8_enc)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := block */
-	bl	aesarmv8_enc1
-	str	q0, [x2]	/* store block */
+	ldr	q0, [x1]	/* q0 := ptxt */
+	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
+	str	q0, [x2]	/* store ctxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_enc)
@@ -476,9 +476,9 @@ END(aesarmv8_enc)
 ENTRY(aesarmv8_dec)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := block */
-	bl	aesarmv8_dec1
-	str	q0, [x2]	/* store block */
+	ldr	q0, [x1]	/* q0 := ctxt */
+	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
+	str	q0, [x2]	/* store ptxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_dec)
@@ -505,7 +505,7 @@ ENTRY(aesarmv8_cbc_enc)
 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-	bl	aesarmv8_enc1		/* q0 := ciphertext block */
+	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
 	subs	x10, x10, #0x10		/* count down nbytes */
 	str	q0, [x2], #0x10		/* store ciphertext block */
 	b.ne	1b			/* repeat if x10 is nonzero */
@@ 



CVS commit: src/sys/crypto/aes/arch/arm

2020-06-30 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Tue Jun 30 21:53:39 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Use `.arch_extension aes' for aese/aesmc/aesd/aesimc.

Unlike `.arch_extension crypto', this works with clang; both work
with gas, so we'll go with this.

Clang still can't handle aes_armv8_64.S yet -- it gets confused by
dup and mov on lanes, but this makes progress.


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.2 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.2	Tue Jun 30 21:41:03 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Tue Jun 30 21:53:39 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.2 2020/06/30 21:41:03 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include 
 
-	.arch_extension	crypto
+	.arch_extension	aes
 
 /*
  * uint32_t rcon[10]




CVS commit: src/sys/crypto/aes/arch/arm

2020-06-30 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Tue Jun 30 21:24:00 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: arm_neon.h

Log Message:
Tweak clang neon intrinsics so they build.

(this file is still a kludge)
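
The reason the clang paths become macros here (a hedged reading of the
diff below, not stated in the log): clang requires the immediate
arguments of its __builtin_neon_* builtins to be integer constant
expressions, and a uint8_t parameter of an inline function is not one,
so the wrapper has to stay a macro to pass the constant through:

	/* sketch mirroring the pattern below; my_vextq_u32 is hypothetical */
	#define	my_vextq_u32(__lo, __hi, __i)				      \
		(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),	      \
		    (int8x16_t)(__hi), (__i), 50)

	uint32x4_t
	shift_in(uint32x4_t lo, uint32x4_t hi)
	{
		return my_vextq_u32(lo, hi, 1);	/* lane index is a literal */
	}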


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/arm_neon.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/arm_neon.h
diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.1 src/sys/crypto/aes/arch/arm/arm_neon.h:1.2
--- src/sys/crypto/aes/arch/arm/arm_neon.h:1.1	Mon Jun 29 23:56:31 2020
+++ src/sys/crypto/aes/arch/arm/arm_neon.h	Tue Jun 30 21:24:00 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.2 2020/06/30 21:24:00 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -73,6 +73,8 @@ typedef __attribute__((neon_vector_type(
 typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
 typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
 typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
 
 #ifdef __LITTLE_ENDIAN__
@@ -118,11 +120,11 @@ vdupq_n_u8(uint8_t __x)
 	};
 }
 
+#if defined(__GNUC__) && !defined(__clang__)
 _INTRINSATTR
 static __inline uint32x4_t
 vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
 {
-#if defined(__GNUC__) && !defined(__clang__)
 #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
 	return __builtin_shuffle(__hi, __lo,
 	(uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
@@ -130,25 +132,31 @@ vextq_u32(uint32x4_t __lo, uint32x4_t __
 	return __builtin_shuffle(__lo, __hi,
 	(uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
 #endif
+}
 #elif defined(__clang__)
 #ifdef __LITTLE_ENDIAN__
-	return __builtin_neon_vextq_v((int8x16_t)__lo, (int8x16_t)__hi, __i,
-	50);
-#else
-	uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo, 3, 2, 1, 0);
-	uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi, 3, 2, 1, 0);
-	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,
-	(int8x16_t)__hi_r, __i, 50);
-	return __builtin_shufflevector(__r, __r, 3, 2, 1, 0);
-#endif
+#define	vextq_u32(__lo, __hi, __i)	  \
+	(uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		  \
+	(int8x16_t)(__hi), (__i), 50)
+#else
+#define	vextq_u32(__lo, __hi, __i) (	  \
+{	  \
+	uint32x4_t __tlo = (__lo);	  \
+	uint32x4_t __thi = (__hi);	  \
+	uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0);   \
+	uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0);   \
+	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	  \
+	(int8x16_t)__hi_r, __i, 50);  \
+	__builtin_shufflevector(__r, __r, 3,2,1,0);			  \
+})
+#endif	/* __LITTLE_ENDIAN__ */
 #endif
-}
 
+#if defined(__GNUC__) && !defined(__clang__)
 _INTRINSATTR
 static __inline uint8x16_t
 vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
 {
-#if defined(__GNUC__) && !defined(__clang__)
 #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
 	return __builtin_shuffle(__hi, __lo,
 	(uint8x16_t) {
@@ -166,38 +174,45 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h
 		__i + 12, __i + 13, __i + 14, __i + 15,
 	});
 #endif
+}
 #elif defined(__clang__)
 #ifdef __LITTLE_ENDIAN__
-	return __builtin_neon_vextq_v((int8x16_t)__lo, (int8x16_t)__hi, __i,
-	48);
-#else
-	uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo,
-	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-	uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi,
-	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-	uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,
-	(int8x16_t)__hi_r, __i, 50);
-	return __builtin_shufflevector(__r, __r,
-	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#endif
+#define	vextq_u8(__lo, __hi, __i)	  \
+	(uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo),		  \
+	(int8x16_t)(__hi), (__i), 48)
+#else
+#define	vextq_u8(__lo, __hi, __i) (	  \
+{	  \
+	uint8x16_t __tlo = (__lo);	  \
+	uint8x16_t __thi = (__hi);	  \
+	uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo,	  \
+	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			  \
+	uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi,	  \
+	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			  \
+	uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r,	  \
+	(int8x16_t)__hi_r, (__i), 48);  \
+	return __builtin_shufflevector(__r, __r,			  \
+	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);			  \
+})
+#endif	/* __LITTLE_ENDIAN */
 #endif
-}
 
+#if defined(__GNUC__) && !defined(__clang__)
 _INTRINSATTR
 static __inline 



CVS commit: src/sys/crypto/aes/arch/arm

2020-06-30 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Tue Jun 30 17:03:14 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: files.aesneon

Log Message:
Limit aes_neon to cpu_cortex | aarch64.

We won't use it on any other systems, and it doesn't build without
NEON anyway.  Verified earmv7hf GENERIC, aarch64 GENERIC64, and
earmv6 RPI2 all build with this.


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/files.aesneon

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/files.aesneon
diff -u src/sys/crypto/aes/arch/arm/files.aesneon:1.2 src/sys/crypto/aes/arch/arm/files.aesneon:1.3
--- src/sys/crypto/aes/arch/arm/files.aesneon:1.2	Mon Jun 29 23:57:56 2020
+++ src/sys/crypto/aes/arch/arm/files.aesneon	Tue Jun 30 17:03:13 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: files.aesneon,v 1.2 2020/06/29 23:57:56 riastradh Exp $
+#	$NetBSD: files.aesneon,v 1.3 2020/06/30 17:03:13 riastradh Exp $
 
 ifdef aarch64
 makeoptions	aes	"COPTS.aes_neon.c"+="-march=armv8-a"
@@ -8,10 +8,8 @@ makeoptions	aes	"COPTS.aes_neon.c"+="-mf
 makeoptions	aes	"COPTS.aes_neon_subr.c"+="-mfloat-abi=softfp -mfpu=neon"
 endif
 
-file	crypto/aes/arch/arm/aes_neon.c		aes
-file	crypto/aes/arch/arm/aes_neon_impl.c	aes
-file	crypto/aes/arch/arm/aes_neon_subr.c	aes
+file	crypto/aes/arch/arm/aes_neon.c		aes & (cpu_cortex | aarch64)
+file	crypto/aes/arch/arm/aes_neon_impl.c	aes & (cpu_cortex | aarch64)
+file	crypto/aes/arch/arm/aes_neon_subr.c	aes & (cpu_cortex | aarch64)
 
-ifndef aarch64
-file	crypto/aes/arch/arm/aes_neon_32.S	aes
-endif
+file	crypto/aes/arch/arm/aes_neon_32.S	aes & cpu_cortex & !aarch64



CVS commit: src/sys/crypto/aes/arch/arm

2020-06-29 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Mon Jun 29 23:57:56 UTC 2020

Modified Files:
src/sys/crypto/aes/arch/arm: aes_neon.c files.aesneon
Added Files:
src/sys/crypto/aes/arch/arm: aes_neon_32.S

Log Message:
Provide hand-written AES NEON assembly for arm32.

gcc does a lousy job at compiling 128-bit NEON intrinsics on arm32;
hand-writing it made it about 12x faster, by avoiding a zillion loads
and stores to spill everything and the kitchen sink onto the stack.
(But gcc does fine on aarch64, presumably because it has twice as
many registers and doesn't have to deal with q2=d4/d5 overlapping.)


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon.c \
src/sys/crypto/aes/arch/arm/files.aesneon
cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/aes_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_neon.c
diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.1 src/sys/crypto/aes/arch/arm/aes_neon.c:1.2
--- src/sys/crypto/aes/arch/arm/aes_neon.c:1.1	Mon Jun 29 23:56:31 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon.c	Mon Jun 29 23:57:56 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
  */
 
 #include 
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $");
 
 #include 
 
@@ -47,6 +47,12 @@ __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v
 
 #include "aes_neon_impl.h"
 
+#ifdef __aarch64__
+#define	__aarch64_used
+#else
+#define	__aarch64_used	__unused
+#endif
+
 static const uint8x16_t
 mc_forward[4] = {
 	{0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
@@ -58,7 +64,7 @@ mc_forward[4] = {
 	{0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
 	 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
 },
-mc_backward[4] = {
+mc_backward[4] __aarch64_used = {
 	{0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
 	 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
 	{0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
@@ -68,7 +74,7 @@ mc_backward[4] = {
 	{0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
 	 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
 },
-ipt[2] = {
+ipt[2] __aarch64_used = {
 	{0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
 	 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
 	{0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
@@ -80,55 +86,55 @@ opt[2] = {
 	{0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
 	 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
 },
-dipt[2] = {
+dipt[2] __aarch64_used = {
 	{0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
 	 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
 	{0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
 	 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
 },
-sb1[2] = {
+sb1[2] __aarch64_used = {
 	{0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
 	 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
 	{0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
 	 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
 },
-sb2[2] = {
+sb2[2] __aarch64_used = {
 	{0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
 	 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
 	{0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
 	 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
 },
-sbo[2] = {
+sbo[2] __aarch64_used = {
 	{0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
 	 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
 	{0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
 	 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
 },
-dsb9[2] = {
+dsb9[2] __aarch64_used = {
 	{0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
 	 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
 	{0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
 	 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
 },
-dsbd[2] = {
+dsbd[2] __aarch64_used = {
 	{0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
 	 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
 	{0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
 	 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
 },
-dsbb[2] = {
+dsbb[2] __aarch64_used = {
 	{0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
 	 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
 	{0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
 	 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
 },
-dsbe[2] = {
+dsbe[2] __aarch64_used = {
 	{0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
 	 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
 	{0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
 	 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
 },
-dsbo[2] = {
+dsbo[2] __aarch64_used = {
 	{0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
 	 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
 	{0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
@@ -164,7 +170,7 @@ deskew[2] = {
 	{0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
 	 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
 },
-sr[4] = {
+sr[4] __aarch64_used = {
 	{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
 	 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
 	{0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
@@ -533,6 +539,14 @@ 
