Module Name:	src
Committed By:	riastradh
Date:		Tue Jun 30 23:06:02 UTC 2020
Modified Files:
	src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Reallocate registers to avoid abusing callee-saves registers, v8-v15.

Forgot to consult the AAPCS before committing this before -- oops!

While here, take advantage of the 32 aarch64 simd registers to avoid
all stack spills.

To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
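For context (this note and sketch are editorial, not part of the commit): under
the AArch64 AAPCS only the low 64 bits of v8-v15 are callee-saved, while
v16-v31 are free temporaries.  A routine that scribbles on q8-q15 therefore
has to spill and reload d8-d15 around the work, whereas one that sticks to
q16-q31 needs no stack traffic at all -- which is what the new register
assignment exploits.  A minimal illustrative sketch with hypothetical labels:

	.text
	/* hypothetical: clobbers q8/q9, so their low halves must be saved */
example_scratch_v8:
	stp	d8, d9, [sp, #-16]!	/* spill callee-saved d8/d9 */
	/* ... use q8/q9 freely here ... */
	ldp	d8, d9, [sp], #16	/* restore before returning */
	ret

	/* hypothetical: clobbers q16/q17, which the AAPCS lets callers assume are trashed */
example_scratch_v16:
	/* ... use q16/q17 freely here, no save/restore needed ... */
	ret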
Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3	Tue Jun 30 21:53:39 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Tue Jun 30 23:06:02 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $ */
+/* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */

 /*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -116,7 +116,7 @@ ENTRY(aesarmv8_setenckey128)
 	adrl	x4, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 table */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 table */
 	str	q1, [x0], #0x10	/* store master key as first round key */
 	mov	x2, #10	/* round count */
@@ -136,7 +136,7 @@ ENTRY(aesarmv8_setenckey128)
 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 	/*
@@ -175,8 +175,8 @@ ENTRY(aesarmv8_setenckey192)
 	adrl	x4, unshiftrows_rotword_1
 	adrl	x5, unshiftrows_rotword_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_1 */
-	ldr	q9, [x5]	/* q9 := unshiftrows_rotword_3 */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_1 */
+	ldr	q17, [x5]	/* q17 := unshiftrows_rotword_3 */
 	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
 	mov	x2, #12	/* round count */
@@ -197,7 +197,7 @@ ENTRY(aesarmv8_setenckey192)
 	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 	/*
@@ -269,8 +269,8 @@ ENTRY(aesarmv8_setenckey192)
 	 * q2 = rk
 	 * q3 = nrk
 	 * v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
-	 * q8 = unshiftrows_rotword_1
-	 * q9 = unshiftrows_rotword_3
+	 * q16 = unshiftrows_rotword_1
+	 * q17 = unshiftrows_rotword_3
 	 *
 	 * We have to compute, in q1:
 	 *
@@ -294,7 +294,7 @@ ENTRY(aesarmv8_setenckey192)
 	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v1.16b, {v1.16b}, v9.16b
+	tbl	v1.16b, {v1.16b}, v17.16b
 	eor	v1.16b, v1.16b, v4.16b
 	/*
@@ -354,8 +354,8 @@ ENTRY(aesarmv8_setenckey256)
 	adrl	x4, unshiftrows_rotword_3
 	adrl	x5, unshiftrows_3
 	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
-	ldr	q8, [x4]	/* q8 := unshiftrows_rotword_3 */
-	ldr	q9, [x5]	/* q9 := unshiftrows_3 */
+	ldr	q16, [x4]	/* q16 := unshiftrows_rotword_3 */
+	ldr	q17, [x5]	/* q17 := unshiftrows_3 */
 	/* store master key as first two round keys */
 	stp	q1, q2, [x0], #0x20
@@ -376,7 +376,7 @@ ENTRY(aesarmv8_setenckey256)
 	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
 	ld1r	{v4.4s}, [x3], #4
-	tbl	v3.16b, {v3.16b}, v8.16b
+	tbl	v3.16b, {v3.16b}, v16.16b
 	eor	v3.16b, v3.16b, v4.16b
 	/*
@@ -402,7 +402,7 @@ ENTRY(aesarmv8_setenckey256)
 	aese	v3.16b, v0.16b	/* v3.4s[i] := SubBytes(rk[3]) */
-	tbl	v3.16b, {v3.16b}, v9.16b
+	tbl	v3.16b, {v3.16b}, v17.16b
 	/*
 	 * v5.4s := (0,prk[0],prk[1],prk[2])
@@ -458,9 +458,9 @@ END(aesarmv8_enctodec)
 ENTRY(aesarmv8_enc)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := block */
-	bl	aesarmv8_enc1
-	str	q0, [x2]	/* store block */
+	ldr	q0, [x1]	/* q0 := ptxt */
+	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
+	str	q0, [x2]	/* store ctxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_enc)
@@ -476,9 +476,9 @@ END(aesarmv8_enc)
 ENTRY(aesarmv8_dec)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q0, [x1]	/* q0 := block */
-	bl	aesarmv8_dec1
-	str	q0, [x2]	/* store block */
+	ldr	q0, [x1]	/* q0 := ctxt */
+	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
+	str	q0, [x2]	/* store ptxt */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_dec)
@@ -505,7 +505,7 @@ ENTRY(aesarmv8_cbc_enc)
 	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
 	mov	x0, x9	/* x0 := enckey */
 	mov	x3, x5	/* x3 := nrounds */
-	bl	aesarmv8_enc1	/* q0 := ciphertext block */
+	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
 	subs	x10, x10, #0x10	/* count down nbytes */
 	str	q0, [x2], #0x10	/* store ciphertext block */
 	b.ne	1b	/* repeat if x10 is nonzero */
@@ -527,10 +527,9 @@ END(aesarmv8_cbc_enc)
 * Standard ABI calling convention.
 */
 ENTRY(aesarmv8_cbc_dec1)
-	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q8, [x4]	/* q8 := iv */
-	str	q8, [sp, #16]	/* save iv */
+	ldr	q24, [x4]	/* q24 := iv */
 	mov	x9, x0	/* x9 := enckey */
 	mov	x10, x3	/* x10 := nbytes */
 	add	x1, x1, x3	/* x1 := pointer past end of in */
@@ -539,18 +538,17 @@ ENTRY(aesarmv8_cbc_dec1)
 	str	q0, [x4]	/* update iv */
 1:	mov	x0, x9	/* x0 := enckey */
 	mov	x3, x5	/* x3 := nrounds */
-	bl	aesarmv8_dec1	/* q0 := cv ^ ptxt; trash x0/x3 */
+	bl	aesarmv8_dec1	/* q0 := cv ^ ptxt; trash x0/x3/q16 */
 	subs	x10, x10, #0x10	/* count down nbytes */
 	b.eq	2f	/* stop if this is the first block */
-	ldr	q8, [x1, #-0x10]!	/* q8 := chaining value */
-	eor	v0.16b, v0.16b, v8.16b	/* q0 := plaintext block */
+	ldr	q31, [x1, #-0x10]!	/* q31 := chaining value */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
 	str	q0, [x2, #-0x10]!	/* store plaintext block */
-	mov	v0.16b, v8.16b	/* move cv = ciphertext block */
+	mov	v0.16b, v31.16b	/* move cv = ciphertext block */
 	b	1b
-2:	ldr	q8, [sp, #16]	/* q8 := iv */
-	eor	v0.16b, v0.16b, v8.16b	/* q0 := first plaintext block */
+2:	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
 	str	q0, [x2, #-0x10]!	/* store first plaintext block */
-	ldp	fp, lr, [sp], #32	/* pop stack frame */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_cbc_dec1)
@@ -566,10 +564,9 @@ END(aesarmv8_cbc_dec1)
 * Standard ABI calling convention.
 */
 ENTRY(aesarmv8_cbc_dec8)
-	stp	fp, lr, [sp, #-32]!	/* push stack frame with uint128 */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q8, [x4]	/* q8 := iv */
-	str	q8, [sp, #16]	/* save iv */
+	ldr	q24, [x4]	/* q24 := iv */
 	mov	x9, x0	/* x9 := enckey */
 	mov	x10, x3	/* x10 := nbytes */
 	add	x1, x1, x3	/* x1 := pointer past end of in */
@@ -579,23 +576,24 @@ ENTRY(aesarmv8_cbc_dec8)
 1:	ldp	q4, q5, [x1, #-0x20]!
 	ldp	q2, q3, [x1, #-0x20]!
 	ldp	q0, q1, [x1, #-0x20]!
-	mov	v15.16b, v6.16b	/* q[8+i] := cv[i], 0<i<8 */
-	mov	v14.16b, v5.16b
-	mov	v13.16b, v4.16b
-	mov	v12.16b, v3.16b
-	mov	v11.16b, v2.16b
-	mov	v10.16b, v1.16b
-	mov	v9.16b, v0.16b
+	mov	v31.16b, v6.16b	/* q[24+i] := cv[i], 0<i<8 */
+	mov	v30.16b, v5.16b
+	mov	v29.16b, v4.16b
+	mov	v28.16b, v3.16b
+	mov	v27.16b, v2.16b
+	mov	v26.16b, v1.16b
+	mov	v25.16b, v0.16b
 	mov	x0, x9	/* x0 := enckey */
 	mov	x3, x5	/* x3 := nrounds */
-	bl	aesarmv8_dec8	/* q[i] := cv[i] ^ pt[i] */
-	eor	v7.16b, v7.16b, v15.16b	/* q[i] := pt[i] */
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v1.16b, v1.16b, v9.16b
+	bl	aesarmv8_dec8	/* q[i] := cv[i] ^ pt[i];
+				 * trash x0/x3/q16 */
+	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v1.16b, v1.16b, v25.16b
 	subs	x10, x10, #0x80	/* count down nbytes */
 	stp	q6, q7, [x2, #-0x20]!	/* store plaintext blocks */
 	stp	q4, q5, [x2, #-0x20]!
@@ -605,10 +603,9 @@ ENTRY(aesarmv8_cbc_dec8)
 	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
 	stp	q0, q1, [x2, #-0x20]!
 	b	1b
-2:	ldr	q8, [sp, #16]	/* q8 := iv */
-	eor	v0.16b, v0.16b, v8.16b	/* q0 := pt0 */
+2:	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
 	stp	q0, q1, [x2, #-0x20]!	/* store first two plaintext blocks */
-	ldp	fp, lr, [sp], #32	/* pop stack frame */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_cbc_dec8)
@@ -629,18 +626,18 @@ ENTRY(aesarmv8_xts_enc1)
 	mov	fp, sp
 	mov	x9, x0	/* x9 := enckey */
 	mov	x10, x3	/* x10 := nbytes */
-	ldr	q9, [x4]	/* q9 := tweak */
+	ldr	q31, [x4]	/* q31 := tweak */
 1:	ldr	q0, [x1], #0x10	/* q0 := ptxt */
 	mov	x0, x9	/* x0 := enckey */
 	mov	x3, x5	/* x3 := nrounds */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt ^ tweak */
-	bl	aesarmv8_enc1	/* q0 := AES(ptxt ^ tweak) */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
+	bl	aesarmv8_enc1	/* q0 := AES(...); trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
 	str	q0, [x2], #0x10	/* store ciphertext block */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x10	/* count down nbytes */
 	b.ne	1b	/* repeat if more blocks */
-	str	q9, [x4]	/* update tweak */
+	str	q31, [x4]	/* update tweak */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_enc1)
@@ -657,61 +654,58 @@ END(aesarmv8_xts_enc1)
 * Standard ABI calling convention.
 */
 ENTRY(aesarmv8_xts_enc8)
-	stp	fp, lr, [sp, #-48]!	/* push stack frame uint128[2] */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
 	mov	x9, x0	/* x9 := enckey */
 	mov	x10, x3	/* x10 := nbytes */
-	ldr	q9, [x4]	/* q9 := tweak */
-1:	str	q9, [sp, #16]	/* save tweak[0] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	str	q9, [sp, #32]	/* save tweak[1] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v10.16b, v9.16b	/* q10 := tweak[2] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v11.16b, v9.16b	/* q11 := tweak[3] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v12.16b, v9.16b	/* q11 := tweak[4] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v13.16b, v9.16b	/* q11 := tweak[5] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v14.16b, v9.16b	/* q11 := tweak[6] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v15.16b, v9.16b	/* q11 := tweak[7] */
-	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
-	ldp	q0, q1, [x1], #0x20	/* q[i] := pt[i] */
+	ldr	q31, [x4]	/* q31 := tweak */
+1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	/* q31 := tweak[7] */
+	ldp	q0, q1, [x1], #0x20	/* q[i] := ptxt[i] */
 	ldp	q2, q3, [x1], #0x20
 	ldp	q4, q5, [x1], #0x20
 	ldp	q6, q7, [x1], #0x20
-	eor	v0.16b, v0.16b, v8.16b	/* q[i] := pt[i] ^ tweak[i] */
-	eor	v1.16b, v1.16b, v9.16b
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
 	mov	x0, x9	/* x0 := enckey */
 	mov	x3, x5	/* x3 := nrounds */
-	bl	aesarmv8_enc8	/* encrypt q0,...,q7; trash x0/x3/q8 */
-	ldr	q8, [sp, #16]	/* reload q8 := tweak[0] */
-	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES(...) ^ tweak[i] */
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v0.16b, v0.16b, v8.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
+	bl	aesarmv8_enc8	/* encrypt q0-q7; trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
 	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
-	stp	q2, q3, [x2], #0x20	/* store ciphertext blocks */
-	stp	q4, q5, [x2], #0x20	/* store ciphertext blocks */
-	stp	q6, q7, [x2], #0x20	/* store ciphertext blocks */
-	mov	v9.16b, v15.16b	/* q9 := q15 = tweak[7] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	stp	q2, q3, [x2], #0x20
+	stp	q4, q5, [x2], #0x20
+	stp	q6, q7, [x2], #0x20
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x80	/* count down nbytes */
 	b.ne	1b	/* repeat if more block groups */
-	str	q9, [x4]	/* update tweak */
-	ldp	fp, lr, [sp], #48	/* pop stack frame */
+	str	q31, [x4]	/* update tweak */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_enc8)
@@ -720,7 +714,7 @@ END(aesarmv8_xts_enc8)
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
- * Decrypt a contiguous sequdece of blocks with AES-XTS.
+ * Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 * nbytes must be a positive integral multiple of 16.  This routine
 * is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
@@ -732,18 +726,18 @@ ENTRY(aesarmv8_xts_dec1)
 	mov	fp, sp
 	mov	x9, x0	/* x9 := deckey */
 	mov	x10, x3	/* x10 := nbytes */
-	ldr	q9, [x4]	/* q9 := tweak */
-1:	ldr	q0, [x1], #0x10	/* q0 := ptxt */
+	ldr	q31, [x4]	/* q31 := tweak */
+1:	ldr	q0, [x1], #0x10	/* q0 := ctxt */
 	mov	x0, x9	/* x0 := deckey */
 	mov	x3, x5	/* x3 := nrounds */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := ptxt ^ tweak */
-	bl	aesarmv8_dec1	/* q0 := AES(ptxt ^ tweak) */
-	eor	v0.16b, v0.16b, v9.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
-	str	q0, [x2], #0x10	/* store ciphertext block */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
+	bl	aesarmv8_dec1	/* q0 := AES(...); trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
+	str	q0, [x2], #0x10	/* store plaintext block */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x10	/* count down nbytes */
 	b.ne	1b	/* repeat if more blocks */
-	str	q9, [x4]	/* update tweak */
+	str	q31, [x4]	/* update tweak */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_dec1)
@@ -753,75 +747,72 @@ END(aesarmv8_xts_dec1)
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
- * Decrypt a contiguous sequdece of blocks with AES-XTS.
+ * Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 * nbytes must be a positive integral multiple of 128.
 *
 * Standard ABI calling convention.
 */
 ENTRY(aesarmv8_xts_dec8)
-	stp	fp, lr, [sp, #-48]!	/* push stack frame uint128[2] */
+	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
 	mov	x9, x0	/* x9 := deckey */
 	mov	x10, x3	/* x10 := nbytes */
-	ldr	q9, [x4]	/* q9 := tweak */
-1:	str	q9, [sp, #16]	/* save tweak[0] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	str	q9, [sp, #32]	/* save tweak[1] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v10.16b, v9.16b	/* q10 := tweak[2] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v11.16b, v9.16b	/* q11 := tweak[3] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v12.16b, v9.16b	/* q11 := tweak[4] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v13.16b, v9.16b	/* q11 := tweak[5] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v14.16b, v9.16b	/* q11 := tweak[6] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
-	mov	v15.16b, v9.16b	/* q11 := tweak[7] */
-	ldp	q8, q9, [sp, #16]	/* q8 := tweak[0], q9 := tweak[1] */
-	ldp	q0, q1, [x1], #0x20	/* q[i] := pt[i] */
+	ldr	q31, [x4]	/* q31 := tweak */
+1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	/* q31 := tweak[7] */
+	ldp	q0, q1, [x1], #0x20	/* q[i] := ctxt[i] */
 	ldp	q2, q3, [x1], #0x20
 	ldp	q4, q5, [x1], #0x20
 	ldp	q6, q7, [x1], #0x20
-	eor	v0.16b, v0.16b, v8.16b	/* q[i] := pt[i] ^ tweak[i] */
-	eor	v1.16b, v1.16b, v9.16b
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
 	mov	x0, x9	/* x0 := deckey */
 	mov	x3, x5	/* x3 := nrounds */
-	bl	aesarmv8_dec8	/* decrypt q0,...,q7; trash x0/x3/q8 */
-	ldr	q8, [sp, #16]	/* reload q8 := tweak[0] */
-	eor	v1.16b, v1.16b, v9.16b	/* q[i] := AES(...) ^ tweak[i] */
-	eor	v2.16b, v2.16b, v10.16b
-	eor	v3.16b, v3.16b, v11.16b
-	eor	v0.16b, v0.16b, v8.16b
-	eor	v4.16b, v4.16b, v12.16b
-	eor	v5.16b, v5.16b, v13.16b
-	eor	v6.16b, v6.16b, v14.16b
-	eor	v7.16b, v7.16b, v15.16b
-	stp	q0, q1, [x2], #0x20	/* store ciphertext blocks */
-	stp	q2, q3, [x2], #0x20	/* store ciphertext blocks */
-	stp	q4, q5, [x2], #0x20	/* store ciphertext blocks */
-	stp	q6, q7, [x2], #0x20	/* store ciphertext blocks */
-	mov	v9.16b, v15.16b	/* q9 := q15 = tweak[7] */
-	bl	aesarmv8_xts_mulx	/* q9 *= x; trash x0/q0/q1 */
+	bl	aesarmv8_dec8	/* decrypt q0-q7; trash x0/x3/q16 */
+	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
+	eor	v1.16b, v1.16b, v25.16b
+	eor	v2.16b, v2.16b, v26.16b
+	eor	v3.16b, v3.16b, v27.16b
+	eor	v4.16b, v4.16b, v28.16b
+	eor	v5.16b, v5.16b, v29.16b
+	eor	v6.16b, v6.16b, v30.16b
+	eor	v7.16b, v7.16b, v31.16b
+	stp	q0, q1, [x2], #0x20	/* store plaintext blocks */
+	stp	q2, q3, [x2], #0x20
+	stp	q4, q5, [x2], #0x20
+	stp	q6, q7, [x2], #0x20
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
 	subs	x10, x10, #0x80	/* count down nbytes */
 	b.ne	1b	/* repeat if more block groups */
-	str	q9, [x4]	/* update tweak */
-	ldp	fp, lr, [sp], #48	/* pop stack frame */
+	str	q31, [x4]	/* update tweak */
+	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_dec8)
 /*
- * aesarmv8_xts_mulx(tweak@q9)
+ * aesarmv8_xts_mulx(tweak@q31)
 *
- * Multiply q9 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
+ * Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 * Uses x0 and q0/q1 as temporaries.
 */
 	.text
@@ -836,12 +827,12 @@ aesarmv8_xts_mulx:
 	 * carried into x^128 = x^7 + x^2 + x + 1.
 	 */
 	adrl	x0, xtscarry
-	cmlt	v1.2d, v9.2d, #0	/* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */
+	cmlt	v1.2d, v31.2d, #0	/* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
 	ldr	q0, [x0]	/* q0 := xtscarry */
 	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
-	shl	v9.2d, v9.2d, #1	/* shift */
+	shl	v31.2d, v31.2d, #1	/* shift */
 	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
-	eor	v9.16b, v9.16b, v0.16b	/* incorporate (a) and (b) */
+	eor	v31.16b, v31.16b, v0.16b	/* incorporate (a) and (b) */
 	ret
 END(aesarmv8_xts_mulx)
@@ -862,9 +853,9 @@ END(xtscarry)
 ENTRY(aesarmv8_xts_update)
 	stp	fp, lr, [sp, #-16]!	/* push stack frame */
 	mov	fp, sp
-	ldr	q9, [x0]	/* load tweak */
-	bl	aesarmv8_xts_mulx	/* q9 *= x */
-	str	q9, [x1]	/* store tweak */
+	ldr	q31, [x0]	/* load tweak */
+	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
+	str	q31, [x1]	/* store tweak */
 	ldp	fp, lr, [sp], #16	/* pop stack frame */
 	ret
 END(aesarmv8_xts_update)
@@ -875,22 +866,22 @@ END(aesarmv8_xts_update)
 *
 * Encrypt a single AES block in q0.
 *
- * Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_enc1,@function
 aesarmv8_enc1:
-	ldr	q8, [x0], #0x10	/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q0 := ShiftRows(SubBytes(AddRoundKey_q8(q0))) */
-	aese	v0.16b, v8.16b
-	ldr	q8, [x0], #0x10	/* load next round key */
+	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
+	aese	v0.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q0 := MixColumns(q0) */
 	aesmc	v0.16b, v0.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_enc1)
@@ -901,24 +892,24 @@ END(aesarmv8_enc1)
 *
 * Encrypt eight AES blocks in q0 through q7 in parallel.
 *
- * Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_enc8,@function
 aesarmv8_enc8:
-	ldr	q8, [x0], #0x10	/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
-	aese	v0.16b, v8.16b
-	aese	v1.16b, v8.16b
-	aese	v2.16b, v8.16b
-	aese	v3.16b, v8.16b
-	aese	v4.16b, v8.16b
-	aese	v5.16b, v8.16b
-	aese	v6.16b, v8.16b
-	aese	v7.16b, v8.16b
-	ldr	q8, [x0], #0x10	/* load next round key */
+	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
+	aese	v0.16b, v16.16b
+	aese	v1.16b, v16.16b
+	aese	v2.16b, v16.16b
+	aese	v3.16b, v16.16b
+	aese	v4.16b, v16.16b
+	aese	v5.16b, v16.16b
+	aese	v6.16b, v16.16b
+	aese	v7.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q[i] := MixColumns(q[i]) */
 	aesmc	v0.16b, v0.16b
@@ -930,14 +921,14 @@ aesarmv8_enc8:
 	aesmc	v6.16b, v6.16b
 	aesmc	v7.16b, v7.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
-	eor	v1.16b, v1.16b, v8.16b
-	eor	v2.16b, v2.16b, v8.16b
-	eor	v3.16b, v3.16b, v8.16b
-	eor	v4.16b, v4.16b, v8.16b
-	eor	v5.16b, v5.16b, v8.16b
-	eor	v6.16b, v6.16b, v8.16b
-	eor	v7.16b, v7.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v3.16b, v3.16b, v16.16b
+	eor	v4.16b, v4.16b, v16.16b
+	eor	v5.16b, v5.16b, v16.16b
+	eor	v6.16b, v6.16b, v16.16b
+	eor	v7.16b, v7.16b, v16.16b
 	ret
 END(aesarmv8_enc8)
@@ -947,22 +938,22 @@ END(aesarmv8_enc8)
 *
 * Decrypt a single AES block in q0.
 *
- * Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_dec1,@function
 aesarmv8_dec1:
-	ldr	q8, [x0], #0x10	/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q8(q0))) */
-	aesd	v0.16b, v8.16b
-	ldr	q8, [x0], #0x10	/* load next round key */
+	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+	aesd	v0.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q0 := InMixColumns(q0) */
 	aesimc	v0.16b, v0.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_dec1)
@@ -973,24 +964,24 @@ END(aesarmv8_dec1)
 *
 * Decrypt eight AES blocks in q0 through q7 in parallel.
 *
- * Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
 	.text
 	_ALIGN_TEXT
 	.type	aesarmv8_dec8,@function
 aesarmv8_dec8:
-	ldr	q8, [x0], #0x10	/* load round key */
+	ldr	q16, [x0], #0x10	/* load round key */
 1:	subs	x3, x3, #1
-	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */
-	aesd	v0.16b, v8.16b
-	aesd	v1.16b, v8.16b
-	aesd	v2.16b, v8.16b
-	aesd	v3.16b, v8.16b
-	aesd	v4.16b, v8.16b
-	aesd	v5.16b, v8.16b
-	aesd	v6.16b, v8.16b
-	aesd	v7.16b, v8.16b
-	ldr	q8, [x0], #0x10	/* load next round key */
+	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
+	aesd	v0.16b, v16.16b
+	aesd	v1.16b, v16.16b
+	aesd	v2.16b, v16.16b
+	aesd	v3.16b, v16.16b
+	aesd	v4.16b, v16.16b
+	aesd	v5.16b, v16.16b
+	aesd	v6.16b, v16.16b
+	aesd	v7.16b, v16.16b
+	ldr	q16, [x0], #0x10	/* load next round key */
 	b.eq	2f
 	/* q[i] := InMixColumns(q[i]) */
 	aesimc	v0.16b, v0.16b
@@ -1002,13 +993,13 @@ aesarmv8_dec8:
 	aesimc	v6.16b, v6.16b
 	aesimc	v7.16b, v7.16b
 	b	1b
-2:	eor	v0.16b, v0.16b, v8.16b	/* AddRoundKey */
-	eor	v1.16b, v1.16b, v8.16b
-	eor	v2.16b, v2.16b, v8.16b
-	eor	v3.16b, v3.16b, v8.16b
-	eor	v4.16b, v4.16b, v8.16b
-	eor	v5.16b, v5.16b, v8.16b
-	eor	v6.16b, v6.16b, v8.16b
-	eor	v7.16b, v7.16b, v8.16b
+2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v3.16b, v3.16b, v16.16b
+	eor	v4.16b, v4.16b, v16.16b
+	eor	v5.16b, v5.16b, v16.16b
+	eor	v6.16b, v6.16b, v16.16b
+	eor	v7.16b, v7.16b, v16.16b
 	ret
 END(aesarmv8_dec8)
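Editorial aside (not part of the commit): the multiply-by-x step that
aesarmv8_xts_mulx performs on the XTS tweak can also be written with
general-purpose registers, which may make the carry handling easier to
follow.  The sketch below is hypothetical and assumes the 128-bit tweak is
held little-endian with bits 0-63 in x1 and bits 64-127 in x2:

	.text
	/* hypothetical sketch: tweak *= x mod x^128 + x^7 + x^2 + x + 1 */
xts_mulx_scalar_sketch:
	mov	x4, #0x87		/* x^7 + x^2 + x + 1 */
	lsr	x3, x2, #63		/* bit shifted out of bit 127 */
	extr	x2, x2, x1, #63		/* high half := (hi << 1) | (lo >> 63) */
	lsl	x1, x1, #1		/* low half := lo << 1 */
	cmp	x3, #0
	csel	x3, x4, xzr, ne		/* 0x87 if a bit carried out, else 0 */
	eor	x1, x1, x3		/* fold the carry back into the low half */
	ret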