Module Name:    src
Committed By:   riastradh
Date:           Mon Jul 27 20:53:23 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: aes_armv8_64.S aes_neon_32.S
        src/sys/crypto/aes/arch/x86: aes_ni_64.S
        src/sys/crypto/chacha/arch/arm: chacha_neon_64.S

Log Message:
Align critical-path loops in AES and ChaCha.


To generate a diff of this commit:
cvs rdiff -u -r1.8 -r1.9 src/sys/crypto/aes/arch/arm/aes_armv8_64.S
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_neon_32.S
cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/x86/aes_ni_64.S
cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
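
For context, _ALIGN_TEXT is the <machine/asm.h> macro that pads the text
section out to the next machine-dependent alignment boundary, so the
backward-branch target at the head of each hot loop begins on an aligned
instruction-fetch boundary.  A minimal sketch of the pattern, loosely
following the aesarmv8_enc1 round loop in the diff below (the aese/b.ne
tail is reconstructed here for illustration and not quoted from the
commit; the exact padding _ALIGN_TEXT emits is machine-dependent):

	/*
	 * Sketch only: one AES round per iteration.  The loop is entered
	 * at 2:, and _ALIGN_TEXT pads so that the branch target 1: falls
	 * on an aligned boundary.
	 */
	ldr	q16, [x0], #0x10	/* load first round key */
	b	2f			/* enter the loop at the test */

	_ALIGN_TEXT			/* align the loop head */
1:	aesmc	v0.16b, v0.16b		/* q0 := MixColumns(q0) */
2:	subs	x3, x3, #1		/* count down rounds */
	aese	v0.16b, v16.16b		/* AddRoundKey, SubBytes, ShiftRows */
	ldr	q16, [x0], #0x10	/* load next round key */
	b.ne	1b			/* repeat while rounds remain */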

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.8 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.8	Sat Jul 25 22:33:04 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Mon Jul 27 20:53:22 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.8 2020/07/25 22:33:04 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -440,6 +440,7 @@ END(aesarmv8_setenckey256)
 ENTRY(aesarmv8_enctodec)
 	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
 	b	2f
+	_ALIGN_TEXT
 1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
 2:	str	q0, [x1], #0x10	/* store round key */
 	subs	x2, x2, #1	/* count down round */
@@ -503,6 +504,7 @@ ENTRY(aesarmv8_cbc_enc)
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
 	ldr	q0, [x4]		/* q0 := chaining value */
+	_ALIGN_TEXT
 1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
 	mov	x0, x9			/* x0 := enckey */
@@ -539,6 +541,7 @@ ENTRY(aesarmv8_cbc_dec1)
 	ldr	q0, [x1, #-0x10]!	/* q0 := last ciphertext block */
 	str	q0, [x4]		/* update iv */
 	b	2f
+	_ALIGN_TEXT
 1:	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
 	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
 	str	q0, [x2, #-0x10]!	/* store plaintext block */
@@ -576,6 +579,7 @@ ENTRY(aesarmv8_cbc_dec8)
 	ldp	q6, q7, [x1, #-0x20]!	/* q6, q7 := last ciphertext blocks */
 	str	q7, [x4]		/* update iv */
 	b	2f
+	_ALIGN_TEXT
 1:	ldp	q6, q7, [x1, #-0x20]!
 	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
 	stp	q0, q1, [x2, #-0x20]!
@@ -629,6 +633,7 @@ ENTRY(aesarmv8_xts_enc1)
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
 	ldr	q31, [x4]		/* q31 := tweak */
+	_ALIGN_TEXT
 1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
@@ -661,6 +666,7 @@ ENTRY(aesarmv8_xts_enc8)
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
 	ldr	q31, [x4]		/* q31 := tweak */
+	_ALIGN_TEXT
 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
@@ -729,6 +735,7 @@ ENTRY(aesarmv8_xts_dec1)
 	mov	x9, x0			/* x9 := deckey */
 	mov	x10, x3			/* x10 := nbytes */
 	ldr	q31, [x4]		/* q31 := tweak */
+	_ALIGN_TEXT
 1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
 	mov	x0, x9			/* x0 := deckey */
 	mov	x3, x5			/* x3 := nrounds */
@@ -761,6 +768,7 @@ ENTRY(aesarmv8_xts_dec8)
 	mov	x9, x0			/* x9 := deckey */
 	mov	x10, x3			/* x10 := nbytes */
 	ldr	q31, [x4]		/* q31 := tweak */
+	_ALIGN_TEXT
 1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
 	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
@@ -879,6 +887,7 @@ ENTRY(aesarmv8_cbcmac_update1)
 	ldr	q0, [x3]		/* q0 := initial authenticator */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
+	_ALIGN_TEXT
 1:	ldr	q1, [x1], #0x10		/* q1 := plaintext block */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x4			/* x3 := nrounds */
@@ -913,6 +922,7 @@ ENTRY(aesarmv8_ccm_enc1)
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */
 #endif
+	_ALIGN_TEXT
 1:	ldr	q3, [x1], #0x10		/* q3 := plaintext block */
 	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
 	mov	x0, x9			/* x0 := enckey */
@@ -972,6 +982,7 @@ ENTRY(aesarmv8_ccm_dec1)
 	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
 	b	2f
 
+	_ALIGN_TEXT
 1:	/*
 	 * Authenticate the last block and decrypt the next block
 	 * simultaneously.
@@ -1031,6 +1042,7 @@ END(ctr32_inc)
 aesarmv8_enc1:
 	ldr	q16, [x0], #0x10	/* load round key */
 	b	2f
+	_ALIGN_TEXT
 1:	/* q0 := MixColumns(q0) */
 	aesmc	v0.16b, v0.16b
 2:	subs	x3, x3, #1
@@ -1056,6 +1068,7 @@ END(aesarmv8_enc1)
 aesarmv8_enc2:
 	ldr	q16, [x0], #0x10	/* load round key */
 	b	2f
+	_ALIGN_TEXT
 1:	/* q[i] := MixColumns(q[i]) */
 	aesmc	v0.16b, v0.16b
 	aesmc	v1.16b, v1.16b
@@ -1085,6 +1098,7 @@ END(aesarmv8_enc2)
 aesarmv8_enc8:
 	ldr	q16, [x0], #0x10	/* load round key */
 	b	2f
+	_ALIGN_TEXT
 1:	/* q[i] := MixColumns(q[i]) */
 	aesmc	v0.16b, v0.16b
 	aesmc	v1.16b, v1.16b
@@ -1131,6 +1145,7 @@ END(aesarmv8_enc8)
 aesarmv8_dec1:
 	ldr	q16, [x0], #0x10	/* load round key */
 	b	2f
+	_ALIGN_TEXT
 1:	/* q0 := InMixColumns(q0) */
 	aesimc	v0.16b, v0.16b
 2:	subs	x3, x3, #1
@@ -1157,6 +1172,7 @@ END(aesarmv8_dec1)
 aesarmv8_dec8:
 	ldr	q16, [x0], #0x10	/* load round key */
 	b	2f
+	_ALIGN_TEXT
 1:	/* q[i] := InMixColumns(q[i]) */
 	aesimc	v0.16b, v0.16b
 	aesimc	v1.16b, v1.16b

Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S
diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.2 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.3
--- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.2	Mon Jul 27 20:52:10 2020
+++ src/sys/crypto/aes/arch/arm/aes_neon_32.S	Mon Jul 27 20:53:22 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_neon_32.S,v 1.2 2020/07/27 20:52:10 riastradh Exp $	*/
+/*	$NetBSD: aes_neon_32.S,v 1.3 2020/07/27 20:53:22 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -316,6 +316,7 @@ ENTRY(aes_neon_enc1)
 
 	b	2f
 
+	_ALIGN_TEXT
 1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
 
 	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
@@ -535,6 +536,7 @@ ENTRY(aes_neon_dec1)
 
 	b	2f
 
+	_ALIGN_TEXT
 1:	/* load dsbd */
 	add	r4, r12, #(dsbd_0 - .Lconstants)
 	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */

Index: src/sys/crypto/aes/arch/x86/aes_ni_64.S
diff -u src/sys/crypto/aes/arch/x86/aes_ni_64.S:1.4 src/sys/crypto/aes/arch/x86/aes_ni_64.S:1.5
--- src/sys/crypto/aes/arch/x86/aes_ni_64.S:1.4	Sat Jul 25 22:29:06 2020
+++ src/sys/crypto/aes/arch/x86/aes_ni_64.S	Mon Jul 27 20:53:22 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_ni_64.S,v 1.4 2020/07/25 22:29:06 riastradh Exp $	*/
+/*	$NetBSD: aes_ni_64.S,v 1.5 2020/07/27 20:53:22 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -523,6 +523,7 @@ ENTRY(aesni_enctodec)
 	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
 	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
 	jmp	2f
+	_ALIGN_TEXT
 1:	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
 	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
 	movdqa	%xmm0,(%rsi)	/* store round key */
@@ -580,6 +581,7 @@ ENTRY(aesni_cbc_enc)
 	jz	2f
 	mov	%rcx,%r10		/* r10 := nbytes */
 	movdqu	(%r8),%xmm0		/* xmm0 := chaining value */
+	_ALIGN_TEXT
 1:	movdqu	(%rsi),%xmm1		/* xmm1 := plaintext block */
 	lea	0x10(%rsi),%rsi
 	pxor	%xmm1,%xmm0		/* xmm0 := cv ^ ptxt */
@@ -615,6 +617,7 @@ ENTRY(aesni_cbc_dec1)
 	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
 	movdqu	%xmm0,(%r8)		/* update iv */
 	jmp	2f
+	_ALIGN_TEXT
 1:	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
 	pxor	%xmm8,%xmm0		/* xmm0 := ptxt */
 	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
@@ -650,6 +653,7 @@ ENTRY(aesni_cbc_dec8)
 	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
 	movdqu	%xmm7,(%r8)		/* update iv */
 	jmp	2f
+	_ALIGN_TEXT
 1:	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
 	pxor	%xmm7,%xmm0		/* xmm0 := ptxt[0] */
 	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
@@ -706,6 +710,7 @@ END(aesni_cbc_dec8)
 ENTRY(aesni_xts_enc1)
 	mov	%rcx,%r10		/* r10 := nbytes */
 	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
+	_ALIGN_TEXT
 1:	movdqu	(%rsi),%xmm0		/* xmm0 := ptxt */
 	lea	0x10(%rsi),%rsi		/* advance rdi to next block */
 	pxor	%xmm15,%xmm0		/* xmm0 := ptxt ^ tweak */
@@ -738,6 +743,7 @@ ENTRY(aesni_xts_enc8)
 	sub	$0x10,%rsp
 	mov	%rcx,%r10		/* r10 := nbytes */
 	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
+	_ALIGN_TEXT
 1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
 	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
 	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
@@ -812,6 +818,7 @@ END(aesni_xts_enc8)
 ENTRY(aesni_xts_dec1)
 	mov	%rcx,%r10		/* r10 := nbytes */
 	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
+	_ALIGN_TEXT
 1:	movdqu	(%rsi),%xmm0		/* xmm0 := ctxt */
 	lea	0x10(%rsi),%rsi		/* advance rdi to next block */
 	pxor	%xmm15,%xmm0		/* xmm0 := ctxt ^ tweak */
@@ -844,6 +851,7 @@ ENTRY(aesni_xts_dec8)
 	sub	$0x10,%rsp
 	mov	%rcx,%r10		/* r10 := nbytes */
 	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
+	_ALIGN_TEXT
 1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
 	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
 	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
@@ -964,6 +972,7 @@ ENTRY(aesni_cbcmac_update1)
 	movdqu	(%rcx),%xmm0		/* xmm0 := auth */
 	mov	%rdx,%r10		/* r10 := nbytes */
 	mov	%rcx,%rdx		/* rdx := &auth */
+	_ALIGN_TEXT
 1:	pxor	(%rsi),%xmm0		/* xmm0 ^= plaintext block */
 	lea	0x10(%rsi),%rsi
 	mov	%r8d,%ecx		/* ecx := nrounds */
@@ -992,6 +1001,7 @@ ENTRY(aesni_ccm_enc1)
 	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
 	movdqu	(%r8),%xmm0		/* xmm0 := auth */
 	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
+	_ALIGN_TEXT
 1:	movdqu	(%rsi),%xmm3		/* xmm3 := plaintext block */
 	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
 	lea	0x10(%rsi),%rsi
@@ -1040,6 +1050,7 @@ ENTRY(aesni_ccm_dec1)
 	call	aesni_enc1		/* xmm0 := pad; trash rax/rcx/xmm8 */
 	jmp	2f
 
+	_ALIGN_TEXT
 1:	/*
 	 * Authenticate the last block and decrypt the next block
 	 * simultaneously.
@@ -1103,6 +1114,7 @@ aesni_enc1:
 	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
 	neg	%rcx		/* rcx := byte offset of round key from end */
 	jmp	2f
+	_ALIGN_TEXT
 1:	aesenc	%xmm8,%xmm0
 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
 	add	$0x10,%rcx
@@ -1130,6 +1142,7 @@ aesni_enc2:
 	pxor	%xmm8,%xmm0	/* xor in first round key */
 	pxor	%xmm8,%xmm1
 	jmp	2f
+	_ALIGN_TEXT
 1:	aesenc	%xmm8,%xmm0
 	aesenc	%xmm8,%xmm1
 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
@@ -1165,6 +1178,7 @@ aesni_enc8:
 	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
 	neg	%rcx		/* rcx := byte offset of round key from end */
 	jmp	2f
+	_ALIGN_TEXT
 1:	aesenc	%xmm8,%xmm0
 	aesenc	%xmm8,%xmm1
 	aesenc	%xmm8,%xmm2
@@ -1204,6 +1218,7 @@ aesni_dec1:
 	lea	0x10(%rdi,%rcx),%rax	/* rax := pointer to round key */
 	neg	%rcx		/* rcx := byte offset of round key from end */
 	jmp	2f
+	_ALIGN_TEXT
 1:	aesdec	%xmm8,%xmm0
 2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
 	add	$0x10,%rcx
@@ -1237,6 +1252,7 @@ aesni_dec8:
 	lea	0x10(%rdi,%rcx),%rax	/* rax := pointer to round key */
 	neg	%rcx		/* rcx := byte offset of round key from end */
 	jmp	2f
+	_ALIGN_TEXT
 1:	aesdec	%xmm8,%xmm0
 	aesdec	%xmm8,%xmm1
 	aesdec	%xmm8,%xmm2

Index: src/sys/crypto/chacha/arch/arm/chacha_neon_64.S
diff -u src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.2 src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.3
--- src/sys/crypto/chacha/arch/arm/chacha_neon_64.S:1.2	Mon Jul 27 20:50:25 2020
+++ src/sys/crypto/chacha/arch/arm/chacha_neon_64.S	Mon Jul 27 20:53:23 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: chacha_neon_64.S,v 1.2 2020/07/27 20:50:25 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_64.S,v 1.3 2020/07/27 20:53:23 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -201,6 +201,7 @@ ENTRY(chacha_stream256_neon)
 	mov	w11, v14.s[0]
 	mov	w12, v15.s[0]
 
+	_ALIGN_TEXT
 1:	subs	w5, w5, #2
 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
 	    v28,v29,v30,v31, v27)
@@ -339,6 +340,7 @@ ENTRY(chacha_stream_xor256_neon)
 	mov	w11, v14.s[0]
 	mov	w12, v15.s[0]
 
+        _ALIGN_TEXT
 1:	subs	w6, w6, #2
 	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
 	    v28,v29,v30,v31, v27)
