Module Name:    src
Committed By:   riastradh
Date:           Tue Jun 30 23:06:02 UTC 2020

Modified Files:
        src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Reallocate registers to avoid abusing callee-saves registers, v8-v15.

Forgot to consult the AAPCS before committing this earlier -- oops!

While here, take advantage of the 32 aarch64 simd registers to avoid
all stack spills.
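
For context, AAPCS64 makes v8-v15 callee-saved (their low 64 bits must
survive a call), while v16-v31 are caller-saved temporaries.  A hedged
sketch of the save/restore boilerplate that using q8/q9 as scratch would
otherwise force on a routine -- illustrative only, not code from this
file:

	stp	d8, d9, [sp, #-16]!	/* spill callee-saved low halves */
	/* ... use q8/q9 freely as temporaries ... */
	ldp	d8, d9, [sp], #16	/* restore before returning */

Keeping scratch values in q16-q31 instead needs no such spills, which is
also why this revision can drop the stack slots previously used to save
the IV and tweaks.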


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3	Tue Jun 30 21:53:39 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Tue Jun 30 23:06:02 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -116,7 +116,7 @@ ENTRY(aesarmv8_setenckey128)
 
 	adrl	x4, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 table */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */
 
 	str	q1, [x0], #0x10	/* store master key as first round key */
 	mov	x2, #10		/* round count */
@@ -136,7 +136,7 @@ ENTRY(aesarmv8_setenckey128)
 
 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 
 	/*
@@ -175,8 +175,8 @@ ENTRY(aesarmv8_setenckey192)
 	adrl	x4, unshiftrows_rotword_1
 	adrl	x5, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_1 */
-	ldr	q9, [x5]	/* q9 := unshiftrows_rotword_3 */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
+	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */
 
 	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
 	mov	x2, #12		/* round count */
@@ -197,7 +197,7 @@ ENTRY(aesarmv8_setenckey192)
 
 	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 
 	/*
@@ -269,8 +269,8 @@ ENTRY(aesarmv8_setenckey192)
 	 *	q2 = rk
 	 *	q3 = nrk
 	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
-	 *	q8 = unshiftrows_rotword_1
-	 *	q9 = unshiftrows_rotword_3
+	 *	q16 = unshiftrows_rotword_1
+	 *	q17 = unshiftrows_rotword_3
 	 *
 	 * We have to compute, in q1:
 	 *
@@ -294,7 +294,7 @@ ENTRY(aesarmv8_setenckey192)
 
 	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v1.16b, {v1.16b}, v9.16b
+	tbl	v1.16b, {v1.16b}, v17.16b
 	eor	v1.16b, v1.16b, v4.16b
 
 	/*
@@ -354,8 +354,8 @@ ENTRY(aesarmv8_setenckey256)
 	adrl	x4, unshiftrows_rotword_3
 	adrl	x5, unshiftrows_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 */
-	ldr	q9, [x5]	/* q9 := unshiftrows_3 */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
+	ldr	q17, [x5]	/* q17 := unshiftrows_3 */
 
 	/* store master key as first two round keys */
 	stp	q1, q2, [x0], #0x20
@@ -376,7 +376,7 @@ ENTRY(aesarmv8_setenckey256)
 
 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 
 	/*
@@ -402,7 +402,7 @@ ENTRY(aesarmv8_setenckey256)
 	aese	v3.16b, v0.16b
 
 	/* v3.4s[i] := SubBytes(rk[3]) */
-	tbl	v3.16b, {v3.16b}, v9.16b
+	tbl	v3.16b, {v3.16b}, v17.16b
 
 	/*
 	 * v5.4s := (0,prk[0],prk[1],prk[2])
@@ -458,9 +458,9 @@ END(aesarmv8_enctodec)
 ENTRY(aesarmv8_enc)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := block */
-	bl	aesarmv8_enc1
-	str	q0, [x2]	/* store block */
+	ldr	q0, [x1]	/* q0 := ptxt */
+	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
+	str	q0, [x2]	/* store ctxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_enc)
@@ -476,9 +476,9 @@ END(aesarmv8_enc)
 ENTRY(aesarmv8_dec)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := block */
-	bl	aesarmv8_dec1
-	str	q0, [x2]	/* store block */
+	ldr	q0, [x1]	/* q0 := ctxt */
+	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
+	str	q0, [x2]	/* store ptxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_dec)
@@ -505,7 +505,7 @@ ENTRY(aesarmv8_cbc_enc)
 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-	bl	aesarmv8_enc1		/* q0 := ciphertext block */
+	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
 	subs	x10, x10, #0x10		/* count down nbytes */
 	str	q0, [x2], #0x10		/* store ciphertext block */
 	b.ne	1b			/* repeat if x10 is nonzero */
@@ -527,10 +527,9 @@ END(aesarmv8_cbc_enc)
  *	Standard ABI calling convention.
  */
 ENTRY(aesarmv8_cbc_dec1)
-	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q8, [x4]		/* q8 := iv */
-	str	q8, [sp, #16]		/* save iv */
+	ldr	q24, [x4]		/* q24 := iv */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
 	add	x1, x1, x3		/* x1 := pointer past end of in */
@@ -539,18 +538,17 @@ ENTRY(aesarmv8_cbc_dec1)
 	str	q0, [x4]		/* update iv */
 1:	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3 */
+	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
 	subs	x10, x10, #0x10		/* count down nbytes */
 	b.eq	2f			/* stop if this is the first block */
-	ldr	q8, [x1, #-0x10]!	/* q8 := chaining value */
-	eor	v0.16b, v0.16b, v8.16b	/* q0 := plaintext block */
+	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
 	str	q0, [x2, #-0x10]!	/* store plaintext block */
-	mov	v0.16b, v8.16b		/* move cv = ciphertext block */
+	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
 	b	1b
-2:	ldr	q8, [sp, #16]		/* q8 := iv */
-	eor	v0.16b, v0.16b, v8.16b	/* q0 := first plaintext block */
+2:	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
 	str	q0, [x2, #-0x10]!	/* store first plaintext block */
-	ldp	fp, lr, [sp], #32	/* pop stack frame */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_cbc_dec1)
 
@@ -566,10 +564,9 @@ END(aesarmv8_cbc_dec1)
  *	Standard ABI calling convention.
  */
 ENTRY(aesarmv8_cbc_dec8)
-	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q8, [x4]		/* q8 := iv */
-	str	q8, [sp, #16]		/* save iv */
+	ldr	q24, [x4]		/* q24 := iv */
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
 	add	x1, x1, x3		/* x1 := pointer past end of in */
@@ -579,23 +576,24 @@ ENTRY(aesarmv8_cbc_dec8)
 1:	ldp	q4, q5, [x1, #-0x20]!
 	ldp	q2, q3, [x1, #-0x20]!
 	ldp	q0, q1, [x1, #-0x20]!
-	mov	v15.16b, v6.16b		/* q[8+i] := cv[i], 0<i<8 */
-	mov	v14.16b, v5.16b
-	mov	v13.16b, v4.16b
-	mov	v12.16b, v3.16b
-	mov	v11.16b, v2.16b
-	mov	v10.16b, v1.16b
-	mov	v9.16b, v0.16b
+	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
+	mov	v30.16b, v5.16b
+	mov	v29.16b, v4.16b
+	mov	v28.16b, v3.16b
+	mov	v27.16b, v2.16b
+	mov	v26.16b, v1.16b
+	mov	v25.16b, v0.16b
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i] */
-	eor	v7.16b, v7.16b, v15.16b	/* q[i] := pt[i] */
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v1.16b, v1.16b, v9.16b
+	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
+					 * trash x0/x3/q16 */
+	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v1.16b, v1.16b, v25.16b
 	subs	x10, x10, #0x80		/* count down nbytes */
 	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
 	stp	q4, q5, [x2, #-0x20]!
@@ -605,10 +603,9 @@ ENTRY(aesarmv8_cbc_dec8)
 	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
 	stp	q0, q1, [x2, #-0x20]!
 	b	1b
-2:	ldr	q8, [sp, #16]		/* q8 := iv */
-	eor	v0.16b, v0.16b, v8.16b	/* q0 := pt0 */
+2:	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
 	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
-	ldp	fp, lr, [sp], #32	/* pop stack frame */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_cbc_dec8)
 
@@ -629,18 +626,18 @@ ENTRY(aesarmv8_xts_enc1)
 	mov	fp, sp
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-	ldr	q9, [x4]		/* q9 := tweak */
+	ldr	q31, [x4]		/* q31 := tweak */
 1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt ^ tweak */
-	bl	aesarmv8_enc1		/* q0 := AES(ptxt ^ tweak) */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
+	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
 	str	q0, [x2], #0x10		/* store ciphertext block */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x10		/* count down nbytes */
 	b.ne	1b			/* repeat if more blocks */
-	str	q9, [x4]		/* update tweak */
+	str	q31, [x4]		/* update tweak */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_enc1)
@@ -657,61 +654,58 @@ END(aesarmv8_xts_enc1)
  *	Standard ABI calling convention.
  */
 ENTRY(aesarmv8_xts_enc8)
-	stp	fp, lr, [sp, #-48]!	/* push stack frame uint128[2] */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
 	mov	x9, x0			/* x9 := enckey */
 	mov	x10, x3			/* x10 := nbytes */
-	ldr	q9, [x4]		/* q9 := tweak */
-1:	str	q9, [sp, #16]		/* save tweak[0] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	str	q9, [sp, #32]		/* save tweak[1] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v12.16b, v9.16b		/* q11 := tweak[4] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v13.16b, v9.16b		/* q11 := tweak[5] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v14.16b, v9.16b		/* q11 := tweak[6] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v15.16b, v9.16b		/* q11 := tweak[7] */
-	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
-	ldp	q0, q1, [x1], #0x20	/* q[i] := pt[i] */
+	ldr	q31, [x4]		/* q31 := tweak */
+1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+					/* q31 := tweak[7] */
+	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
 	ldp	q2, q3, [x1], #0x20
 	ldp	q4, q5, [x1], #0x20
 	ldp	q6, q7, [x1], #0x20
-	eor	v0.16b, v0.16b, v8.16b	/* q[i] := pt[i] ^ tweak[i] */
-	eor	v1.16b, v1.16b, v9.16b
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
 	mov	x0, x9			/* x0 := enckey */
 	mov	x3, x5			/* x3 := nrounds */
-	bl	aesarmv8_enc8		/* encrypt q0,...,q7; trash x0/x3/q8 */
-	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
-	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES(...) ^ tweak[i] */
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v0.16b, v0.16b, v8.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
+	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
 	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
-	stp	q2, q3, [x2], #0x20	/* store ciphertext blocks */
-	stp	q4, q5, [x2], #0x20	/* store ciphertext blocks */
-	stp	q6, q7, [x2], #0x20	/* store ciphertext blocks */
-	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	stp	q2, q3, [x2], #0x20
+	stp	q4, q5, [x2], #0x20
+	stp	q6, q7, [x2], #0x20
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x80		/* count down nbytes */
 	b.ne	1b			/* repeat if more block groups */
-	str	q9, [x4]		/* update tweak */
-	ldp	fp, lr, [sp], #48	/* pop stack frame */
+	str	q31, [x4]		/* update tweak */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_enc8)
 
@@ -720,7 +714,7 @@ END(aesarmv8_xts_enc8)
  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
  *     uint32_t nrounds@x5)
  *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
  *
  *	nbytes must be a positive integral multiple of 16.  This routine
  *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
@@ -732,18 +726,18 @@ ENTRY(aesarmv8_xts_dec1)
 	mov	fp, sp
 	mov	x9, x0			/* x9 := deckey */
 	mov	x10, x3			/* x10 := nbytes */
-	ldr	q9, [x4]		/* q9 := tweak */
-1:	ldr	q0, [x1], #0x10		/* q0 := ptxt */
+	ldr	q31, [x4]		/* q31 := tweak */
+1:	ldr	q0, [x1], #0x10		/* q0 := ctxt */
 	mov	x0, x9			/* x0 := deckey */
 	mov	x3, x5			/* x3 := nrounds */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt ^ tweak */
-	bl	aesarmv8_dec1		/* q0 := AES(ptxt ^ tweak) */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
-	str	q0, [x2], #0x10		/* store ciphertext block */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
+	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
+	str	q0, [x2], #0x10		/* store plaintext block */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x10		/* count down nbytes */
 	b.ne	1b			/* repeat if more blocks */
-	str	q9, [x4]		/* update tweak */
+	str	q31, [x4]		/* update tweak */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_dec1)
@@ -753,75 +747,72 @@ END(aesarmv8_xts_dec1)
  *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
  *     uint32_t nrounds@x5)
  *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
  *
  *	nbytes must be a positive integral multiple of 128.
  *
  *	Standard ABI calling convention.
  */
 ENTRY(aesarmv8_xts_dec8)
-	stp	fp, lr, [sp, #-48]!	/* push stack frame uint128[2] */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
 	mov	x9, x0			/* x9 := deckey */
 	mov	x10, x3			/* x10 := nbytes */
-	ldr	q9, [x4]		/* q9 := tweak */
-1:	str	q9, [sp, #16]		/* save tweak[0] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	str	q9, [sp, #32]		/* save tweak[1] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v10.16b, v9.16b		/* q10 := tweak[2] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v11.16b, v9.16b		/* q11 := tweak[3] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v12.16b, v9.16b		/* q11 := tweak[4] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v13.16b, v9.16b		/* q11 := tweak[5] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v14.16b, v9.16b		/* q11 := tweak[6] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v15.16b, v9.16b		/* q11 := tweak[7] */
-	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
-	ldp	q0, q1, [x1], #0x20	/* q[i] := pt[i] */
+	ldr	q31, [x4]		/* q31 := tweak */
+1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+					/* q31 := tweak[7] */
+	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
 	ldp	q2, q3, [x1], #0x20
 	ldp	q4, q5, [x1], #0x20
 	ldp	q6, q7, [x1], #0x20
-	eor	v0.16b, v0.16b, v8.16b	/* q[i] := pt[i] ^ tweak[i] */
-	eor	v1.16b, v1.16b, v9.16b
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
 	mov	x0, x9			/* x0 := deckey */
 	mov	x3, x5			/* x3 := nrounds */
-	bl	aesarmv8_dec8		/* decrypt q0,...,q7; trash x0/x3/q8 */
-	ldr	q8, [sp, #16]		/* reload q8 := tweak[0] */
-	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES(...) ^ tweak[i] */
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v0.16b, v0.16b, v8.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
-	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
-	stp	q2, q3, [x2], #0x20	/* store ciphertext blocks */
-	stp	q4, q5, [x2], #0x20	/* store ciphertext blocks */
-	stp	q6, q7, [x2], #0x20	/* store ciphertext blocks */
-	mov	v9.16b, v15.16b		/* q9 := q15 = tweak[7] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
+	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
+	stp	q2, q3, [x2], #0x20
+	stp	q4, q5, [x2], #0x20
+	stp	q6, q7, [x2], #0x20
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x80		/* count down nbytes */
 	b.ne	1b			/* repeat if more block groups */
-	str	q9, [x4]		/* update tweak */
-	ldp	fp, lr, [sp], #48	/* pop stack frame */
+	str	q31, [x4]		/* update tweak */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_dec8)
 
 /*
- * aesarmv8_xts_mulx(tweak@q9)
+ * aesarmv8_xts_mulx(tweak@q31)
  *
- *	Multiply q9 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
+ *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
  *	Uses x0 and q0/q1 as temporaries.
  */
 	.text
@@ -836,12 +827,12 @@ aesarmv8_xts_mulx:
 	 *     carried into x^128 = x^7 + x^2 + x + 1.
 	 */
 	adrl	x0, xtscarry
-	cmlt	v1.2d, v9.2d, #0 /* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */
+	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
 	ldr	q0, [x0]		/* q0 := xtscarry */
 	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
-	shl	v9.2d, v9.2d, #1	/* shift */
+	shl	v31.2d, v31.2d, #1	/* shift */
 	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
-	eor	v9.16b, v9.16b, v0.16b	/* incorporate (a) and (b) */
+	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
 	ret
 END(aesarmv8_xts_mulx)
 
@@ -862,9 +853,9 @@ END(xtscarry)
 ENTRY(aesarmv8_xts_update)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q9, [x0]		/* load tweak */
-	bl	aesarmv8_xts_mulx	/* q9 *= x */
-	str	q9, [x1]		/* store tweak */
+	ldr	q31, [x0]		/* load tweak */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	str	q31, [x1]		/* store tweak */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_update)
@@ -875,22 +866,22 @@ END(aesarmv8_xts_update)
  *
  *	Encrypt a single AES block in q0.
  *
- *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
  */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_enc1,@function
 aesarmv8_enc1:
-	ldr	q8, [x0], #0x10		/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q0 := ShiftRows(SubBytes(AddRoundKey_q8(q0))) */
-	aese	v0.16b, v8.16b
-	ldr	q8, [x0], #0x10		/* load next round key */
+	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
+	aese	v0.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q0 := MixColumns(q0) */
 	aesmc	v0.16b, v0.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_enc1)
 
@@ -901,24 +892,24 @@ END(aesarmv8_enc1)
  *
  *	Encrypt eight AES blocks in q0 through q7 in parallel.
  *
- *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
  */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_enc8,@function
 aesarmv8_enc8:
-	ldr	q8, [x0], #0x10		/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
-	aese	v0.16b, v8.16b
-	aese	v1.16b, v8.16b
-	aese	v2.16b, v8.16b
-	aese	v3.16b, v8.16b
-	aese	v4.16b, v8.16b
-	aese	v5.16b, v8.16b
-	aese	v6.16b, v8.16b
-	aese	v7.16b, v8.16b
-	ldr	q8, [x0], #0x10		/* load next round key */
+	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
+	aese	v0.16b, v16.16b
+	aese	v1.16b, v16.16b
+	aese	v2.16b, v16.16b
+	aese	v3.16b, v16.16b
+	aese	v4.16b, v16.16b
+	aese	v5.16b, v16.16b
+	aese	v6.16b, v16.16b
+	aese	v7.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q[i] := MixColumns(q[i]) */
 	aesmc	v0.16b, v0.16b
@@ -930,14 +921,14 @@ aesarmv8_enc8:
 	aesmc	v6.16b, v6.16b
 	aesmc	v7.16b, v7.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
-	eor	v1.16b, v1.16b, v8.16b
-	eor	v2.16b, v2.16b, v8.16b
-	eor	v3.16b, v3.16b, v8.16b
-	eor	v4.16b, v4.16b, v8.16b
-	eor	v5.16b, v5.16b, v8.16b
-	eor	v6.16b, v6.16b, v8.16b
-	eor	v7.16b, v7.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v3.16b, v3.16b, v16.16b
+	eor	v4.16b, v4.16b, v16.16b
+	eor	v5.16b, v5.16b, v16.16b
+	eor	v6.16b, v6.16b, v16.16b
+	eor	v7.16b, v7.16b, v16.16b
 	ret
 END(aesarmv8_enc8)
 
@@ -947,22 +938,22 @@ END(aesarmv8_enc8)
  *
  *	Decrypt a single AES block in q0.
  *
- *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
  */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_dec1,@function
 aesarmv8_dec1:
-	ldr	q8, [x0], #0x10		/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q8(q0))) */
-	aesd	v0.16b, v8.16b
-	ldr	q8, [x0], #0x10		/* load next round key */
+	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+	aesd	v0.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q0 := InMixColumns(q0) */
 	aesimc	v0.16b, v0.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_dec1)
 
@@ -973,24 +964,24 @@ END(aesarmv8_dec1)
  *
  *	Decrypt eight AES blocks in q0 through q7 in parallel.
  *
- *	Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
  */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_dec8,@function
 aesarmv8_dec8:
-	ldr	q8, [x0], #0x10		/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */
-	aesd	v0.16b, v8.16b
-	aesd	v1.16b, v8.16b
-	aesd	v2.16b, v8.16b
-	aesd	v3.16b, v8.16b
-	aesd	v4.16b, v8.16b
-	aesd	v5.16b, v8.16b
-	aesd	v6.16b, v8.16b
-	aesd	v7.16b, v8.16b
-	ldr	q8, [x0], #0x10		/* load next round key */
+	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
+	aesd	v0.16b, v16.16b
+	aesd	v1.16b, v16.16b
+	aesd	v2.16b, v16.16b
+	aesd	v3.16b, v16.16b
+	aesd	v4.16b, v16.16b
+	aesd	v5.16b, v16.16b
+	aesd	v6.16b, v16.16b
+	aesd	v7.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q[i] := InMixColumns(q[i]) */
 	aesimc	v0.16b, v0.16b
@@ -1002,13 +993,13 @@ aesarmv8_dec8:
 	aesimc	v6.16b, v6.16b
 	aesimc	v7.16b, v7.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
-	eor	v1.16b, v1.16b, v8.16b
-	eor	v2.16b, v2.16b, v8.16b
-	eor	v3.16b, v3.16b, v8.16b
-	eor	v4.16b, v4.16b, v8.16b
-	eor	v5.16b, v5.16b, v8.16b
-	eor	v6.16b, v6.16b, v8.16b
-	eor	v7.16b, v7.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v3.16b, v3.16b, v16.16b
+	eor	v4.16b, v4.16b, v16.16b
+	eor	v5.16b, v5.16b, v16.16b
+	eor	v6.16b, v6.16b, v16.16b
+	eor	v7.16b, v7.16b, v16.16b
 	ret
 END(aesarmv8_dec8)
