Module Name:	src
Committed By:	riastradh
Date:		Mon Jul 27 20:54:12 UTC 2020
Modified Files:
	src/sys/crypto/aes/arch/arm: aes_armv8_64.S

Log Message:
Issue aese/aesmc and aesd/aesimc in pairs.

Advised by the aarch64 optimization guide; increases cgd throughput
by about 10%.

To generate a diff of this commit:
cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
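The pairing matters because many Armv8-A implementations (for example
the Cortex-A57 and Cortex-A72, per their software optimization guides)
can fuse an adjacent aese/aesmc or aesd/aesimc pair operating on the
same register into a single macro-op.  A minimal illustrative sketch of
the pattern (not taken verbatim from the file; register choices are
arbitrary):

	/* harder to fuse: each aese is separated from its aesmc */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aesmc	v1.16b, v1.16b

	/* fusible: each aese immediately followed by its own aesmc */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b

As a side effect, the rewrite below peels the final round (which has no
MixColumns step) out of each loop instead of branching into the middle
of the loop body.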
Modified files:

Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S
diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.10
--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9	Mon Jul 27 20:53:22 2020
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S	Mon Jul 27 20:54:11 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.10 2020/07/27 20:54:11 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -1041,15 +1041,18 @@ END(ctr32_inc)
 	.type	aesarmv8_enc1,@function
 aesarmv8_enc1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := MixColumns(q0) */
+1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
 	aese	v0.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
+	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_enc1)
@@ -1067,17 +1070,21 @@ END(aesarmv8_enc1)
 	.type	aesarmv8_enc2,@function
 aesarmv8_enc2:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
 	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	ret
@@ -1097,18 +1104,28 @@ END(aesarmv8_enc2)
 	.type	aesarmv8_enc8,@function
 aesarmv8_enc8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
+	aese	v2.16b, v16.16b
 	aesmc	v2.16b, v2.16b
+	aese	v3.16b, v16.16b
 	aesmc	v3.16b, v3.16b
+	aese	v4.16b, v16.16b
 	aesmc	v4.16b, v4.16b
+	aese	v5.16b, v16.16b
 	aesmc	v5.16b, v5.16b
+	aese	v6.16b, v16.16b
 	aesmc	v6.16b, v6.16b
+	aese	v7.16b, v16.16b
 	aesmc	v7.16b, v7.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
@@ -1118,9 +1135,9 @@ aesarmv8_enc8:
 	aese	v5.16b, v16.16b
 	aese	v6.16b, v16.16b
 	aese	v7.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
-	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
+	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	eor	v2.16b, v2.16b, v16.16b
 	eor	v3.16b, v3.16b, v16.16b
@@ -1144,15 +1161,19 @@ END(aesarmv8_enc8)
 	.type	aesarmv8_dec1,@function
 aesarmv8_dec1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := InMixColumns(q0) */
-	aesimc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
-	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
 	aesd	v0.16b, v16.16b
+	/* q0 := InMixColumns(q0) */
+	aesimc	v0.16b, v0.16b
 	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
 	b.ne	1b
+	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+	aesd	v0.16b, v16.16b
+	ldr	q16, [x0]		/* load last round key */
+	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_dec1)
@@ -1171,18 +1192,29 @@ END(aesarmv8_dec1)
 	.type	aesarmv8_dec8,@function
 aesarmv8_dec8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := InMixColumns(q[i]) */
+1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
+	aesd	v0.16b, v16.16b
+	/* q[i] := InMixColumns(q[i]) */
 	aesimc	v0.16b, v0.16b
+	aesd	v1.16b, v16.16b
 	aesimc	v1.16b, v1.16b
+	aesd	v2.16b, v16.16b
 	aesimc	v2.16b, v2.16b
+	aesd	v3.16b, v16.16b
 	aesimc	v3.16b, v3.16b
+	aesd	v4.16b, v16.16b
 	aesimc	v4.16b, v4.16b
+	aesd	v5.16b, v16.16b
 	aesimc	v5.16b, v5.16b
+	aesd	v6.16b, v16.16b
 	aesimc	v6.16b, v6.16b
+	aesd	v7.16b, v16.16b
 	aesimc	v7.16b, v7.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
 	aesd	v0.16b, v16.16b
 	aesd	v1.16b, v16.16b
@@ -1192,9 +1224,9 @@ aesarmv8_dec8:
 	aesd	v5.16b, v16.16b
 	aesd	v6.16b, v16.16b
 	aesd	v7.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
-	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
+	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	eor	v2.16b, v2.16b, v16.16b
 	eor	v3.16b, v3.16b, v16.16b
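
For reference, the rewritten aesarmv8_enc1 round loop reads as follows
once the patch is applied (reassembled from the hunks above; x0 walks
the expanded key schedule and x3 carries the round count, as in the
surrounding code):

aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0]		/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret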