CVS commit: src/sys/crypto/chacha/arch/arm

2020-09-08 Thread Jonathan A. Kollasch
Module Name:src
Committed By:   jakllsch
Date:   Tue Sep  8 17:17:32 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: files.chacha_arm

Log Message:
use correct condition


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/files.chacha_arm

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/files.chacha_arm
diff -u src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.3 src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.4
--- src/sys/crypto/chacha/arch/arm/files.chacha_arm:1.3	Tue Jul 28 20:08:48 2020
+++ src/sys/crypto/chacha/arch/arm/files.chacha_arm	Tue Sep  8 17:17:32 2020
@@ -1,9 +1,9 @@
-#	$NetBSD: files.chacha_arm,v 1.3 2020/07/28 20:08:48 riastradh Exp $
+#	$NetBSD: files.chacha_arm,v 1.4 2020/09/08 17:17:32 jakllsch Exp $
 
 ifdef aarch64
 makeoptions	chacha	"COPTS.chacha_neon.c"+="-march=armv8-a"
 else
-makeoptions	aes	"COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon"
+makeoptions	chacha	"COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon"
 endif
 
 file	crypto/chacha/arch/arm/chacha_neon.c	chacha & (cpu_cortex | aarch64)



CVS commit: src/sys/crypto/chacha/arch/arm

2020-09-07 Thread Jonathan A. Kollasch
Module Name:src
Committed By:   jakllsch
Date:   Mon Sep  7 18:05:17 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: chacha_neon_64.S

Log Message:
Use a working macro to detect big endian aarch64.

Fixes aarch64eb NEON ChaCha.


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.6 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.7
--- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.6	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Mon Sep  7 18:05:17 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: chacha_neon_64.S,v 1.6 2020/08/08 14:47:01 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")
 
 #define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
 STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
@@ -130,12 +130,12 @@ STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b
 #define	STEP19(a,b,c,d, t, r)	/* nothing */
 #endif
 
-#if _BYTE_ORDER == _LITTLE_ENDIAN
-#define	HTOLE32(x)
-#define	LE32TOH(x)
-#elif _BYTE_ORDER == _BIG_ENDIAN
+#if defined(__AARCH64EB__)
 #define	HTOLE32(x)	rev32	x, x
 #define	LE32TOH(x)	rev32	x, x
+#else
+#define	LE32TOH(x)
+#define	HTOLE32(x)
 #endif
 
 /*



CVS commit: src/sys/crypto/chacha/arch/arm

2020-08-23 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Sun Aug 23 16:39:06 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: chacha_neon_32.S

Log Message:
Adjust sp, not fp, to allocate a 32-byte temporary.

Costs another couple MOV instructions, but we can't skimp on this --
there's no red zone below sp for interrupts on arm, so we can't touch
anything there.  So just use fp to save sp and then adjust sp itself,
rather than using fp as a temporary register to point just below sp.

Should fix PR port-arm/55598 -- previously the ChaCha self-test
failed 33/1 trials triggered by sysctl during running system;
with the patch it has failed 0/1 trials.

(Presumably it happened more often at boot time, leading to 5/26
failures in the test bed, because we just enabled interrupts and some
devices are starting to deliver interrupts.)


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.3 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.4
--- src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.3	Sat Aug  8 14:47:01 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Sun Aug 23 16:39:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2020/08/08 14:47:01 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_32.S,v 1.4 2020/08/23 16:39:06 riastradh Exp $")
 
 	.fpu	neon
 
@@ -54,7 +54,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2
  */
 
 .macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
-	vld1.8		{\c2-\c3}, [fp, :256]
+	vld1.8		{\c2-\c3}, [sp, :256]
 .endm
 
 .macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
@@ -80,7 +80,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2
 	vadd.u32	\c2, \c2, \d2
 	vadd.u32	\c3, \c3, \d3
 
-	vst1.8		{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */
+	vst1.8		{\c0-\c1}, [sp, :256]	/* free c0 and c1 as temps */
 
 	veor		\c0, \b0, \c0
 	veor		\c1, \b1, \c1
@@ -118,7 +118,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2
 	vtbl.8		\d3l, {\d3l}, \c0l
 	vtbl.8		\d3h, {\d3h}, \c0l
 
-	vld1.8		{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */
+	vld1.8		{\c0-\c1}, [sp, :256]	/* restore c0 and c1 */
 
 	/* c += d; b ^= c; b <<<= 7 */
 	vadd.u32	\c2, \c2, \d2
@@ -126,7 +126,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.3 2
 	vadd.u32	\c0, \c0, \d0
 	vadd.u32	\c1, \c1, \d1
 
-	vst1.8		{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */
+	vst1.8		{\c2-\c3}, [sp, :256]	/* free c2 and c3 as temps */
 
 	veor		\c2, \b2, \c2
 	veor		\c3, \b3, \c3
@@ -160,17 +160,18 @@ ENTRY(chacha_stream256_neon)
 	/* save callee-saves registers */
 	push	{r4, r5, r6, r7, r8, r10, fp, lr}
 	vpush	{d8-d15}
+	mov	fp, sp
 
 	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
 	ldr	r7, .Lconstants_addr
 	adr	r6, .Lconstants_addr
 
 	/* reserve space for two 128-bit/16-byte q registers */
-	sub	fp, sp, #0x20
-	bic	fp, fp, #0x1f	/* align */
+	sub	sp, sp, #0x20
+	bic	sp, sp, #0x1f	/* align */
 
 	/* get parameters */
-	add	ip, sp, #96
+	add	ip, fp, #96
 	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
 	ldm	ip, {r4, r5}	/* r4 := const, r5 := nr */
 	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */
@@ -311,7 +312,7 @@ ENTRY(chacha_stream256_neon)
 	vadd.u32 q3, q3, q8
 	vadd.u32 q7, q7, q8
 
-	vld1.8	{q8-q9}, [fp, :256]	/* restore q8-q9 */
+	vld1.8	{q8-q9}, [sp, :256]	/* restore q8-q9 */
 
 	vst1.8	{q0-q1}, [r0]!
 	vld1.8	{q0}, [r3]	/* q0 := key[16:32) */
@@ -354,9 +355,10 @@ ENTRY(chacha_stream256_neon)
 	/* zero temporary space on the stack */
 	vmov.i32 q0, #0
 	vmov.i32 q1, #0
-	vst1.8	{q0-q1}, [fp, :256]
+	vst1.8	{q0-q1}, [sp, :256]
 
 	/* restore callee-saves registers and stack */
+	mov	sp, fp
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
 	bx	lr
@@ -374,17 +376,18 @@ ENTRY(chacha_stream_xor256_neon)
 	/* save callee-saves registers */
 	push	{r4, r5, r6, r7, r8, r10, fp, lr}
 	vpush	{d8-d15}
+	mov	fp, sp
 
 	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
 	ldr	r7, .Lconstants_addr
 	adr	r6, .Lconstants_addr
 
 	/* reserve space for two 128-bit/16-byte q registers */
-	sub	fp, sp, #0x20
-	bic	fp, fp, #0x1f	/* align */
+	sub	sp, sp, #0x20
+	bic	sp, sp, #0x1f	/* align */
 
 	/* get parameters */
-	add	ip, sp, #96
+	add	ip, fp, #96
 	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
 	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
 	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */
@@ -475,7 +478,7 @@ 

CVS commit: src/sys/crypto/chacha/arch/arm

2020-07-29 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Wed Jul 29 14:23:59 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: chacha_neon_32.S

Log Message:
Issue three more swaps to save eight stores.

Reduces code size and yields a small (~2%) cgd throughput boost.

Remove duplicate comment while here.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.2
--- src/sys/crypto/chacha/arch/arm/chacha_neon_32.S:1.1	Tue Jul 28 20:08:48 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_32.S	Wed Jul 29 14:23:59 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $")
 
 	.fpu	neon
 
@@ -305,21 +305,29 @@ ENTRY(chacha_stream256_neon)
 	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
 	 *
 	 * The first two rows to write out are q0 = x0[0:4) and q4 =
-	 * x0[4:8).  If we first swap q1 and q4, then once we've
-	 * written them out we free up consecutive registers q0-q1 for
-	 * store-multiple.
+	 * x0[4:8).  Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14
+	 * enables us to issue all stores in consecutive pairs:
+	 *	x0 in q0-q1
+	 *	x1 in q8-q9
+	 *	x2 in q2-q3
+	 *	x3 in q10-q11
+	 *	x4 in q4-q5
+	 *	x5 in q12-q13
+	 *	x6 in q6-q7
+	 *	x7 in q14-q15
 	 */
 
 	vswp	q1, q4
+	vswp	q3, q6
 
 	vadd.u32 q0, q0, q9
 	vadd.u32 q4, q4, q9
 	vadd.u32 q2, q2, q9
-	vadd.u32 q3, q3, q9
+	vadd.u32 q6, q6, q9
 
 	vadd.u32 q1, q1, q8
 	vadd.u32 q5, q5, q8
-	vadd.u32 q6, q6, q8
+	vadd.u32 q3, q3, q8
 	vadd.u32 q7, q7, q8
 
 	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */
@@ -349,14 +357,17 @@ ENTRY(chacha_stream256_neon)
 	vswp	d19, d22
 	vswp	d27, d30
 
+	vswp	q9, q12
+	vswp	q11, q14
+
 	vadd.u32 q8, q8, q0
-	vadd.u32 q9, q9, q0
+	vadd.u32 q12, q12, q0
 	vadd.u32 q10, q10, q0
-	vadd.u32 q11, q11, q0
+	vadd.u32 q14, q14, q0
 
-	vadd.u32 q12, q12, q1
+	vadd.u32 q9, q9, q1
 	vadd.u32 q13, q13, q1
-	vadd.u32 q14, q14, q1
+	vadd.u32 q11, q11, q1
 	vadd.u32 q15, q15, q1
 
 	LE32TOH(q8)
@@ -368,28 +379,18 @@ ENTRY(chacha_stream256_neon)
 	LE32TOH(q14)
 	LE32TOH(q15)
 
-	/* prepare to zero temporary space on stack */
-	vmov.i32 q0, #0
-	vmov.i32 q1, #0
-
-	/* vst1.32	{q0}, [r0]! */
-	/* vst1.32	{q1}, [r0]! */	/* (was q4 before vswp) */
-	vst1.32	{q8}, [r0]!
-	vst1.32	{q12}, [r0]!
-	vst1.32	{q2}, [r0]!
-	vst1.32	{q6}, [r0]!
-	vst1.32	{q10}, [r0]!
-	vst1.32	{q14}, [r0]!
-	vst1.32	{q4}, [r0]!	/* (was q1 before vswp) */
-	vst1.32	{q5}, [r0]!
-	vst1.32	{q9}, [r0]!
-	vst1.32	{q13}, [r0]!
-	vst1.32 {q3}, [r0]!
-	vst1.32 {q7}, [r0]!
-	vst1.32 {q11}, [r0]!
-	vst1.32 {q15}, [r0]
+	/* vst1.32	{q0-q1}, [r0]! */
+	vst1.32	{q8-q9}, [r0]!
+	vst1.32	{q2-q3}, [r0]!
+	vst1.32	{q10-q11}, [r0]!
+	vst1.32	{q4-q5}, [r0]!
+	vst1.32	{q12-q13}, [r0]!
+	vst1.32 {q6-q7}, [r0]!
+	vst1.32 {q14-q15}, [r0]
 
 	/* zero temporary space on the stack */
+	vmov.i32 q0, #0
+	vmov.i32 q1, #0
 	vst1.8	{q0-q1}, [fp, :256]
 
 	/* restore callee-saves registers and stack */
@@ -481,42 +482,8 @@ ENTRY(chacha_stream_xor256_neon)
 	 * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
 	 * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
 	 * transposed from one another, and the x[i] are in general
-	 * registers and memory.  So we have:
-	 *
-	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
-	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
-	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
-	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
-	 *	...
-	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
-	 *
-	 * where xi[j] is the jth word of the ith 16-word block.  Zip
-	 * consecutive pairs with vzip.32, and you get:
-	 *
-	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
-	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
-	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
-	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
-	 *	...
-	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
-	 *
-	 * As 64-bit d registers, this is:
-	 *
-	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
-	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
-	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
-	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
-	 *	...
-	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
-	 *
-	 * Swap d1<->d4, d3<->d6, ..., and you get:
-	 *
-	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
-	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
-	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
-	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
-	 *	...
-	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
+	 * registers and memory.  

CVS commit: src/sys/crypto/chacha/arch/arm

2020-07-28 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Tue Jul 28 20:05:33 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: chacha_neon.c

Log Message:
Fix big-endian build with appropriate casts around vrev32q_u8.


To generate a diff of this commit:
cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/chacha/arch/arm/chacha_neon.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.6
--- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.5	Mon Jul 27 20:58:56 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c	Tue Jul 28 20:05:33 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon.c,v 1.6 2020/07/28 20:05:33 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -53,7 +53,7 @@ vhtole_u32(uint32x4_t x)
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 	return x;
 #elif _BYTE_ORDER == _BIG_ENDIAN
-	return vrev32q_u8(x);
+	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
 #endif
 }
 
@@ -63,7 +63,7 @@ vletoh_u32(uint32x4_t x)
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 	return x;
 #elif _BYTE_ORDER == _BIG_ENDIAN
-	return vrev32q_u8(x);
+	return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
 #endif
 }
 



CVS commit: src/sys/crypto/chacha/arch/arm

2020-07-28 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Tue Jul 28 15:42:41 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: chacha_neon_64.S

Log Message:
Fix typo in comment.


To generate a diff of this commit:
cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.4 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.5
--- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.4	Mon Jul 27 20:57:23 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Tue Jul 28 15:42:41 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_64.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -28,7 +28,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: chacha_neon_64.S,v 1.4 2020/07/27 20:57:23 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_64.S,v 1.5 2020/07/28 15:42:41 riastradh Exp $")
 
 #define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
 STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
@@ -142,7 +142,7 @@ STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b
  * chacha_stream256_neon(uint8_t s[256]@x0,
  * uint32_t blkno@w1,
  * const uint8_t nonce[12]@x2,
- * const uint8_t key[12]@x3,
+ * const uint8_t key[32]@x3,
  * const uint8_t const[16]@x4,
  * unsigned nr@w5)
  */



CVS commit: src/sys/crypto/chacha/arch/arm

2020-07-27 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Mon Jul 27 20:58:56 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c

Log Message:
Note that VSRI seems to hurt here.


To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h
cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -u src/sys/crypto/chacha/arch/arm/arm_neon.h:1.2 src/sys/crypto/chacha/arch/arm/arm_neon.h:1.3
--- src/sys/crypto/chacha/arch/arm/arm_neon.h:1.2	Mon Jul 27 20:58:06 2020
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h	Mon Jul 27 20:58:56 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -529,6 +529,40 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t 
 #endif	/* __LITTLE_ENDIAN__ */
 #endif
 
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
+{
+#ifdef __aarch64__
+	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
+#else
+	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
+	(int32x4_t)__vsh, __bits);
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	vsriq_n_u32(__vins, __vsh, __bits)  \
+	(int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	  \
+	(int32x4_t)(__vsh), (__bits), 34)
+#else
+#define	vsliq_n_s32(__vins, __vsh, __bits) (  \
+{	  \
+	int32x4_t __tvins = (__vins);	  \
+	int32x4_t __tvsh = (__vsh);	  \
+	uint8_t __tbits = (__bits);	  \
+	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	  \
+	3,2,1,0);			  \
+	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	  \
+	3,2,1,0);			  \
+	int32x4_t __r = __builtin_neon_vsriq_n_v(__tvins, __tvsh, __tbits,\
+	34);			  \
+	__builtin_shufflevector(__r, __r, 3,2,1,0);			  \
+})
+#endif
+#endif
+
 _INTRINSATTR
 static __inline void
 vst1q_u32(uint32_t *__p32, uint32x4_t __v)

Index: src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.4 src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.5
--- src/sys/crypto/chacha/arch/arm/chacha_neon.c:1.4	Mon Jul 27 20:58:06 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c	Mon Jul 27 20:58:56 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -36,7 +36,15 @@ static inline uint32x4_t
 vrolq_n_u32(uint32x4_t x, uint8_t n)
 {
 
+	/*
+	 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
+	 * practice it hurts performance at least on Cortex-A8.
+	 */
+#if 1
 	return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
+#else
+	return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
+#endif
 }
 
 static inline uint32x4_t



CVS commit: src/sys/crypto/chacha/arch/arm

2020-07-27 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Mon Jul 27 20:58:07 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: arm_neon.h chacha_neon.c

Log Message:
Take advantage of REV32 and TBL for 16-bit and 8-bit rotations.

However, disable use of (V)TBL on armv7/aarch32 for now, because for
some reason GCC spills things to the stack despite having plenty of
free registers, which hurts performance more than it helps at least
on ARM Cortex-A8.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/arm_neon.h
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/chacha/arch/arm/chacha_neon.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -u src/sys/crypto/chacha/arch/arm/arm_neon.h:1.1 src/sys/crypto/chacha/arch/arm/arm_neon.h:1.2
--- src/sys/crypto/chacha/arch/arm/arm_neon.h:1.1	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h	Mon Jul 27 20:58:06 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: arm_neon.h,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+/*	$NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,6 +39,7 @@
 typedef __Int32x4_t int32x4_t;
 typedef __Int64x2_t int64x2_t;
 typedef __Int8x16_t int8x16_t;
+typedef __Uint16x8_t uint16x8_t;
 typedef __Uint32x4_t uint32x4_t;
 typedef __Uint64x2_t uint64x2_t;
 typedef __Uint8x16_t uint8x16_t;
@@ -46,6 +47,7 @@ typedef __Uint8x16_t uint8x16_t;
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
 typedef __simd128_int8_t int8x16_t;
+typedef __simd128_uint16_t uint16x8_t;
 typedef __simd128_uint32_t uint32x4_t;
 typedef __simd128_uint64_t uint64x2_t;
 typedef __simd128_uint8_t uint8x16_t;
@@ -70,9 +72,11 @@ typedef struct { uint8x8_t val[2]; } uin
 typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
 typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
 typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+
 typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
 typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
 typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
 
 typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
 typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
@@ -330,6 +334,27 @@ vreinterpretq_s32_u8(uint8x16_t __v)
 }
 
 _INTRINSATTR
+static __inline uint16x8_t
+vreinterpretq_u16_u32(uint32x4_t __v)
+{
+	return (uint16x8_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u16(uint16x8_t __v)
+{
+	return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint32x4_t
+vreinterpretq_u32_u64(uint64x2_t __v)
+{
+	return (uint32x4_t)__v;
+}
+
+_INTRINSATTR
 static __inline uint32x4_t
 vreinterpretq_u32_u8(uint8x16_t __v)
 {
@@ -338,6 +363,13 @@ vreinterpretq_u32_u8(uint8x16_t __v)
 
 _INTRINSATTR
 static __inline uint64x2_t
+vreinterpretq_u64_u32(uint32x4_t __v)
+{
+	return (uint64x2_t)__v;
+}
+
+_INTRINSATTR
+static __inline uint64x2_t
 vreinterpretq_u64_u8(uint8x16_t __v)
 {
 	return (uint64x2_t)__v;
@@ -365,6 +397,17 @@ vreinterpretq_u8_u64(uint64x2_t __v)
 }
 
 _INTRINSATTR
+static __inline uint16x8_t
+vrev32q_u16(uint16x8_t __v)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
+#elif defined(__clang__)
+	return __builtin_shufflevector(__v,  1,0, 3,2, 5,4, 7,6);
+#endif
+}
+
+_INTRINSATTR
 static __inline uint8x16_t
 vrev32q_u8(uint8x16_t __v)
 {
@@ -531,4 +574,58 @@ vst1q_u8(uint8_t *__p8, uint8x16_t __v)
 #endif
 }
 
+#ifndef __aarch64__		/* XXX */
+
+_INTRINSATTR
+static __inline uint8x8_t
+vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
+	(int8x8_t)__idx);
+#elif defined(__clang__)
+	uint8x8_t __ret;
+#ifndef __LITTLE_ENDIAN__
+	__tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
+	__idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
+#endif
+	__ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
+	(int8x8_t)__idx, 16);
+#ifndef __LITTLE_ENDIAN__
+	__ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
+#endif
+	return __ret;
+#endif
+}
+
+_INTRINSATTR
+static __inline uint8x8_t
+vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	union {
+		uint8x8x2_t __u8x8x82;
+		__builtin_neon_ti __ti;
+	} __u = { __tab };
+	return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
+#elif defined(__clang__)
+	uint8x8_t __ret;
+#ifndef __LITTLE_ENDIAN__
+	__tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
+	7,6,5,4,3,2,1,0);
+	__tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
+	

CVS commit: src/sys/crypto/chacha/arch/arm

2020-07-27 Thread Taylor R Campbell
Module Name:src
Committed By:   riastradh
Date:   Mon Jul 27 20:50:25 UTC 2020

Modified Files:
src/sys/crypto/chacha/arch/arm: chacha_neon_64.S

Log Message:
Use <machine/asm.h> rather than copying things from it here.

Vestige from userland build on netbsd-9 during development.


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.2
--- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.1	Sat Jul 25 22:51:57 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Mon Jul 27 20:50:25 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_64.S,v 1.1 2020/07/25 22:51:57 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_64.S,v 1.2 2020/07/27 20:50:25 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,23 +26,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-.macro	adrl 	reg, addr
-	adrp	\reg, \addr
-	add	\reg, \reg, #:lo12:\addr
-.endm
-
-#define	_ALIGN_TEXT			  \
-	.p2align 4
-
-#define	ENTRY(x)			  \
-	.text;  \
-	_ALIGN_TEXT;			  \
-	.global	x;			  \
-	.type	x,@function;		  \
-x:
-
-#define	END(x)  \
-	.size x, . - x
+#include <machine/asm.h>
 
 #define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
 STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \