Module Name:    src
Committed By:   riastradh
Date:           Mon Jul 27 20:54:12 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Issue aese/aesmc and aesd/aesimc in pairs.

Advised by the AArch64 optimization guides, which recommend issuing each
aese/aesmc and aesd/aesimc pair back to back so that cores able to fuse
the pair can do so; increases cgd throughput by about 10%.
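
For illustration only (not part of the commit), here is a minimal C sketch
of the same structure using the ARMv8 Crypto Extension intrinsics.  The
function name aes_enc1_sketch, the nrounds parameter, and the flat
(nrounds + 1) x 16-byte round-key layout are assumptions for this sketch,
not names or layouts from the NetBSD sources; the point is only that each
AESE/AESMC pair stays adjacent and the last round is peeled out of the
loop, as the rewritten aesarmv8_enc1 in the diff below now does.

/*
 * Illustrative sketch only: one-block AES encryption with the ARMv8
 * Crypto Extension intrinsics, structured like the rewritten
 * aesarmv8_enc1 loop.  Build with something like -march=armv8-a+crypto.
 */
#include <arm_neon.h>
#include <stdint.h>

static uint8x16_t
aes_enc1_sketch(const uint8_t *rk,	/* (nrounds + 1) 16-byte round keys */
    uint8x16_t block, unsigned nrounds)
{
        unsigned i;

        /*
         * nrounds - 1 full rounds: keep each AESE/AESMC pair adjacent
         * so cores that fuse the pair can do so.
         */
        for (i = 0; i < nrounds - 1; i++) {
                /* AddRoundKey, ShiftRows, SubBytes */
                block = vaeseq_u8(block, vld1q_u8(rk + 16*i));
                /* MixColumns */
                block = vaesmcq_u8(block);
        }

        /* Last round: AESE without MixColumns, then the final AddRoundKey. */
        block = vaeseq_u8(block, vld1q_u8(rk + 16*(nrounds - 1)));
        return veorq_u8(block, vld1q_u8(rk + 16*nrounds));
}

Previously the assembly branched into the middle of the round loop and
reloaded the round key between the aese and the following aesmc, so the
two halves of each round were never adjacent and could not be fused.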


To generate a diff of this commit:
cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.10
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9	Mon Jul 27 20:53:22 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Mon Jul 27 20:54:11 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.10 2020/07/27 20:54:11 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -1041,15 +1041,18 @@ END(ctr32_inc)
 	.type	aesarmv8_enc1,@function
 aesarmv8_enc1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := MixColumns(q0) */
+1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
 	aese	v0.16b, v16.16b
-	ldr	q16, [x0], #0x10		/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
+	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_enc1)
@@ -1067,17 +1070,21 @@ END(aesarmv8_enc1)
 	.type	aesarmv8_enc2,@function
 aesarmv8_enc2:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
-	ldr	q16, [x0], #0x10		/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
 	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	ret
@@ -1097,18 +1104,28 @@ END(aesarmv8_enc2)
 	.type	aesarmv8_enc8,@function
 aesarmv8_enc8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
+	aese	v2.16b, v16.16b
 	aesmc	v2.16b, v2.16b
+	aese	v3.16b, v16.16b
 	aesmc	v3.16b, v3.16b
+	aese	v4.16b, v16.16b
 	aesmc	v4.16b, v4.16b
+	aese	v5.16b, v16.16b
 	aesmc	v5.16b, v5.16b
+	aese	v6.16b, v16.16b
 	aesmc	v6.16b, v6.16b
+	aese	v7.16b, v16.16b
 	aesmc	v7.16b, v7.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
@@ -1118,9 +1135,9 @@ aesarmv8_enc8:
 	aese	v5.16b, v16.16b
 	aese	v6.16b, v16.16b
 	aese	v7.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
-	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
+	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	eor	v2.16b, v2.16b, v16.16b
 	eor	v3.16b, v3.16b, v16.16b
@@ -1144,15 +1161,19 @@ END(aesarmv8_enc8)
 	.type	aesarmv8_dec1,@function
 aesarmv8_dec1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := InMixColumns(q0) */
-	aesimc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
-	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
 	aesd	v0.16b, v16.16b
+	/* q0 := InMixColumns(q0) */
+	aesimc	v0.16b, v0.16b
 	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
 	b.ne	1b
+	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+	aesd	v0.16b, v16.16b
+	ldr	q16, [x0]		/* load last round key */
+	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_dec1)
@@ -1171,18 +1192,29 @@ END(aesarmv8_dec1)
 	.type	aesarmv8_dec8,@function
 aesarmv8_dec8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := InMixColumns(q[i]) */
+1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
+	aesd	v0.16b, v16.16b
+	/* q[i] := InMixColumns(q[i]) */
 	aesimc	v0.16b, v0.16b
+	aesd	v1.16b, v16.16b
 	aesimc	v1.16b, v1.16b
+	aesd	v2.16b, v16.16b
 	aesimc	v2.16b, v2.16b
+	aesd	v3.16b, v16.16b
 	aesimc	v3.16b, v3.16b
+	aesd	v4.16b, v16.16b
 	aesimc	v4.16b, v4.16b
+	aesd	v5.16b, v16.16b
 	aesimc	v5.16b, v5.16b
+	aesd	v6.16b, v16.16b
 	aesimc	v6.16b, v6.16b
+	aesd	v7.16b, v16.16b
 	aesimc	v7.16b, v7.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
 	aesd	v0.16b, v16.16b
 	aesd	v1.16b, v16.16b
@@ -1192,9 +1224,9 @@ aesarmv8_dec8:
 	aesd	v5.16b, v16.16b
 	aesd	v6.16b, v16.16b
 	aesd	v7.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
-	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
+	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	eor	v2.16b, v2.16b, v16.16b
 	eor	v3.16b, v3.16b, v16.16b
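
The decryption routines above (aesarmv8_dec1, aesarmv8_dec8) get the same
restructuring with aesd/aesimc pairs.  A corresponding hedged C sketch for
one block, again with an assumed function name and round-key layout rather
than anything from the tree, and assuming rk already points at a decryption
key schedule laid out in the order AESD/AESIMC consume it (the
equivalent-inverse-cipher form prepared at key-setup time):

/*
 * Illustrative sketch only: one-block AES decryption with each
 * AESD/AESIMC pair kept adjacent, mirroring the rewritten
 * aesarmv8_dec1 loop above.
 */
#include <arm_neon.h>
#include <stdint.h>

static uint8x16_t
aes_dec1_sketch(const uint8_t *rk,	/* assumed (nrounds + 1) 16-byte keys */
    uint8x16_t block, unsigned nrounds)
{
        unsigned i;

        for (i = 0; i < nrounds - 1; i++) {
                /* AddRoundKey, InvShiftRows, InvSubBytes */
                block = vaesdq_u8(block, vld1q_u8(rk + 16*i));
                /* InvMixColumns */
                block = vaesimcq_u8(block);
        }

        /* Last round: AESD without InvMixColumns, then the final AddRoundKey. */
        block = vaesdq_u8(block, vld1q_u8(rk + 16*(nrounds - 1)));
        return veorq_u8(block, vld1q_u8(rk + 16*nrounds));
}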
