Improve overall performance of chacha20 encrypt and decrypt operations
for Power10 or later CPUs.

Signed-off-by: Danny Tsen <dt...@linux.ibm.com>
---
 arch/powerpc/crypto/chacha-p10le-8x.S | 842 ++++++++++++++++++++++++++
 1 file changed, 842 insertions(+)
 create mode 100644 arch/powerpc/crypto/chacha-p10le-8x.S
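
For reference, the vector code below interleaves the standard ChaCha20
double round across 8 blocks (QT_loop_8x) or 4 blocks (QT_loop_4x) at a
time, with each vector register holding the same state word from four
different blocks.  A minimal scalar C sketch of that double round is shown
here for review only; the names are illustrative and it is not part of the
patch:

  #include <stdint.h>

  #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

  /* One ChaCha quarter round on four state words. */
  #define QR(a, b, c, d) do {                       \
          a += b; d ^= a; d = ROTL32(d, 16);        \
          c += d; b ^= c; b = ROTL32(b, 12);        \
          a += b; d ^= a; d = ROTL32(d,  8);        \
          c += d; b ^= c; b = ROTL32(b,  7);        \
  } while (0)

  /* One double round: a column pass followed by a diagonal pass. */
  static void chacha_double_round(uint32_t x[16])
  {
          /* column round */
          QR(x[0], x[4], x[ 8], x[12]);
          QR(x[1], x[5], x[ 9], x[13]);
          QR(x[2], x[6], x[10], x[14]);
          QR(x[3], x[7], x[11], x[15]);
          /* diagonal round */
          QR(x[0], x[5], x[10], x[15]);
          QR(x[1], x[6], x[11], x[12]);
          QR(x[2], x[7], x[ 8], x[13]);
          QR(x[3], x[4], x[ 9], x[14]);
  }

With nrounds = 20 this double round runs 10 times per block, which is why
the assembly below runs its quarter-round loop nrounds / 2 times.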

diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S
new file mode 100644
index 000000000000..17bedb66b822
--- /dev/null
+++ b/arch/powerpc/crypto/chacha-p10le-8x.S
@@ -0,0 +1,842 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dt...@us.ibm.com>
+#
+# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
+#                               size_t len, int nrounds);
+#
+# The round loop performs double rounds; each double round consists of
+# 8 quarter rounds.  A single quarter round is:
+# 1.  a += b; d ^= a; d <<<= 16;
+# 2.  c += d; b ^= c; b <<<= 12;
+# 3.  a += b; d ^= a; d <<<= 8;
+# 4.  c += d; b ^= c; b <<<= 7;
+#
+# row1 = (row1 + row2),  row4 = row1 xor row4,  rotate each word of row4 by 16
+# row3 = (row3 + row4),  row2 = row3 xor row2,  rotate each word of row2 by 12
+# row1 = (row1 + row2),  row4 = row1 xor row4,  rotate each word of row4 by 8
+# row3 = (row3 + row4),  row2 = row3 xor row2,  rotate each word of row2 by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round   (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+#
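+# Rotations by 16 and 8 bits are byte-aligned, so they are folded into the
+# xor step with a single vpermxor using the permute masks defined at permx;
+# rotations by 12 and 7 bits use vrlw with splatted shift counts.
+#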
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine       "any"
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+       std     \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+       li      16, \OFFSET
+       stvx    \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+       li      16, \OFFSET
+       stxvx   \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+       ld      \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+       li      16, \OFFSET
+       lvx     \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+       li      16, \OFFSET
+       lxvx    \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+       mflr 0
+       std 0, 16(1)
+       stdu 1,-752(1)
+
+       SAVE_GPR 14, 112, 1
+       SAVE_GPR 15, 120, 1
+       SAVE_GPR 16, 128, 1
+       SAVE_GPR 17, 136, 1
+       SAVE_GPR 18, 144, 1
+       SAVE_GPR 19, 152, 1
+       SAVE_GPR 20, 160, 1
+       SAVE_GPR 21, 168, 1
+       SAVE_GPR 22, 176, 1
+       SAVE_GPR 23, 184, 1
+       SAVE_GPR 24, 192, 1
+       SAVE_GPR 25, 200, 1
+       SAVE_GPR 26, 208, 1
+       SAVE_GPR 27, 216, 1
+       SAVE_GPR 28, 224, 1
+       SAVE_GPR 29, 232, 1
+       SAVE_GPR 30, 240, 1
+       SAVE_GPR 31, 248, 1
+
+       addi    9, 1, 256
+       SAVE_VRS 20, 0, 9
+       SAVE_VRS 21, 16, 9
+       SAVE_VRS 22, 32, 9
+       SAVE_VRS 23, 48, 9
+       SAVE_VRS 24, 64, 9
+       SAVE_VRS 25, 80, 9
+       SAVE_VRS 26, 96, 9
+       SAVE_VRS 27, 112, 9
+       SAVE_VRS 28, 128, 9
+       SAVE_VRS 29, 144, 9
+       SAVE_VRS 30, 160, 9
+       SAVE_VRS 31, 176, 9
+
+       SAVE_VSX 14, 192, 9
+       SAVE_VSX 15, 208, 9
+       SAVE_VSX 16, 224, 9
+       SAVE_VSX 17, 240, 9
+       SAVE_VSX 18, 256, 9
+       SAVE_VSX 19, 272, 9
+       SAVE_VSX 20, 288, 9
+       SAVE_VSX 21, 304, 9
+       SAVE_VSX 22, 320, 9
+       SAVE_VSX 23, 336, 9
+       SAVE_VSX 24, 352, 9
+       SAVE_VSX 25, 368, 9
+       SAVE_VSX 26, 384, 9
+       SAVE_VSX 27, 400, 9
+       SAVE_VSX 28, 416, 9
+       SAVE_VSX 29, 432, 9
+       SAVE_VSX 30, 448, 9
+       SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+       addi    9, 1, 256
+       RESTORE_VRS 20, 0, 9
+       RESTORE_VRS 21, 16, 9
+       RESTORE_VRS 22, 32, 9
+       RESTORE_VRS 23, 48, 9
+       RESTORE_VRS 24, 64, 9
+       RESTORE_VRS 25, 80, 9
+       RESTORE_VRS 26, 96, 9
+       RESTORE_VRS 27, 112, 9
+       RESTORE_VRS 28, 128, 9
+       RESTORE_VRS 29, 144, 9
+       RESTORE_VRS 30, 160, 9
+       RESTORE_VRS 31, 176, 9
+
+       RESTORE_VSX 14, 192, 9
+       RESTORE_VSX 15, 208, 9
+       RESTORE_VSX 16, 224, 9
+       RESTORE_VSX 17, 240, 9
+       RESTORE_VSX 18, 256, 9
+       RESTORE_VSX 19, 272, 9
+       RESTORE_VSX 20, 288, 9
+       RESTORE_VSX 21, 304, 9
+       RESTORE_VSX 22, 320, 9
+       RESTORE_VSX 23, 336, 9
+       RESTORE_VSX 24, 352, 9
+       RESTORE_VSX 25, 368, 9
+       RESTORE_VSX 26, 384, 9
+       RESTORE_VSX 27, 400, 9
+       RESTORE_VSX 28, 416, 9
+       RESTORE_VSX 29, 432, 9
+       RESTORE_VSX 30, 448, 9
+       RESTORE_VSX 31, 464, 9
+
+       RESTORE_GPR 14, 112, 1
+       RESTORE_GPR 15, 120, 1
+       RESTORE_GPR 16, 128, 1
+       RESTORE_GPR 17, 136, 1
+       RESTORE_GPR 18, 144, 1
+       RESTORE_GPR 19, 152, 1
+       RESTORE_GPR 20, 160, 1
+       RESTORE_GPR 21, 168, 1
+       RESTORE_GPR 22, 176, 1
+       RESTORE_GPR 23, 184, 1
+       RESTORE_GPR 24, 192, 1
+       RESTORE_GPR 25, 200, 1
+       RESTORE_GPR 26, 208, 1
+       RESTORE_GPR 27, 216, 1
+       RESTORE_GPR 28, 224, 1
+       RESTORE_GPR 29, 232, 1
+       RESTORE_GPR 30, 240, 1
+       RESTORE_GPR 31, 248, 1
+
+       addi    1, 1, 752
+       ld 0, 16(1)
+       mtlr 0
+.endm # RESTORE_REGS
+
+.macro QT_loop_8x
+       # QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+       xxlor   0, 32+25, 32+25
+       xxlor   32+25, 20, 20
+       vadduwm 0, 0, 4
+       vadduwm 1, 1, 5
+       vadduwm 2, 2, 6
+       vadduwm 3, 3, 7
+         vadduwm 16, 16, 20
+         vadduwm 17, 17, 21
+         vadduwm 18, 18, 22
+         vadduwm 19, 19, 23
+
+         vpermxor 12, 12, 0, 25
+         vpermxor 13, 13, 1, 25
+         vpermxor 14, 14, 2, 25
+         vpermxor 15, 15, 3, 25
+         vpermxor 28, 28, 16, 25
+         vpermxor 29, 29, 17, 25
+         vpermxor 30, 30, 18, 25
+         vpermxor 31, 31, 19, 25
+       xxlor   32+25, 0, 0
+       vadduwm 8, 8, 12
+       vadduwm 9, 9, 13
+       vadduwm 10, 10, 14
+       vadduwm 11, 11, 15
+         vadduwm 24, 24, 28
+         vadduwm 25, 25, 29
+         vadduwm 26, 26, 30
+         vadduwm 27, 27, 31
+       vxor 4, 4, 8
+       vxor 5, 5, 9
+       vxor 6, 6, 10
+       vxor 7, 7, 11
+         vxor 20, 20, 24
+         vxor 21, 21, 25
+         vxor 22, 22, 26
+         vxor 23, 23, 27
+
+       xxlor   0, 32+25, 32+25
+       xxlor   32+25, 21, 21
+       vrlw 4, 4, 25  #
+       vrlw 5, 5, 25
+       vrlw 6, 6, 25
+       vrlw 7, 7, 25
+         vrlw 20, 20, 25  #
+         vrlw 21, 21, 25
+         vrlw 22, 22, 25
+         vrlw 23, 23, 25
+       xxlor   32+25, 0, 0
+       vadduwm 0, 0, 4
+       vadduwm 1, 1, 5
+       vadduwm 2, 2, 6
+       vadduwm 3, 3, 7
+         vadduwm 16, 16, 20
+         vadduwm 17, 17, 21
+         vadduwm 18, 18, 22
+         vadduwm 19, 19, 23
+
+       xxlor   0, 32+25, 32+25
+       xxlor   32+25, 22, 22
+         vpermxor 12, 12, 0, 25
+         vpermxor 13, 13, 1, 25
+         vpermxor 14, 14, 2, 25
+         vpermxor 15, 15, 3, 25
+         vpermxor 28, 28, 16, 25
+         vpermxor 29, 29, 17, 25
+         vpermxor 30, 30, 18, 25
+         vpermxor 31, 31, 19, 25
+       xxlor   32+25, 0, 0
+       vadduwm 8, 8, 12
+       vadduwm 9, 9, 13
+       vadduwm 10, 10, 14
+       vadduwm 11, 11, 15
+         vadduwm 24, 24, 28
+         vadduwm 25, 25, 29
+         vadduwm 26, 26, 30
+         vadduwm 27, 27, 31
+       xxlor   0, 32+28, 32+28
+       xxlor   32+28, 23, 23
+       vxor 4, 4, 8
+       vxor 5, 5, 9
+       vxor 6, 6, 10
+       vxor 7, 7, 11
+         vxor 20, 20, 24
+         vxor 21, 21, 25
+         vxor 22, 22, 26
+         vxor 23, 23, 27
+       vrlw 4, 4, 28  #
+       vrlw 5, 5, 28
+       vrlw 6, 6, 28
+       vrlw 7, 7, 28
+         vrlw 20, 20, 28  #
+         vrlw 21, 21, 28
+         vrlw 22, 22, 28
+         vrlw 23, 23, 28
+       xxlor   32+28, 0, 0
+
+       # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+       xxlor   0, 32+25, 32+25
+       xxlor   32+25, 20, 20
+       vadduwm 0, 0, 5
+       vadduwm 1, 1, 6
+       vadduwm 2, 2, 7
+       vadduwm 3, 3, 4
+         vadduwm 16, 16, 21
+         vadduwm 17, 17, 22
+         vadduwm 18, 18, 23
+         vadduwm 19, 19, 20
+
+         vpermxor 15, 15, 0, 25
+         vpermxor 12, 12, 1, 25
+         vpermxor 13, 13, 2, 25
+         vpermxor 14, 14, 3, 25
+         vpermxor 31, 31, 16, 25
+         vpermxor 28, 28, 17, 25
+         vpermxor 29, 29, 18, 25
+         vpermxor 30, 30, 19, 25
+
+       xxlor   32+25, 0, 0
+       vadduwm 10, 10, 15
+       vadduwm 11, 11, 12
+       vadduwm 8, 8, 13
+       vadduwm 9, 9, 14
+         vadduwm 26, 26, 31
+         vadduwm 27, 27, 28
+         vadduwm 24, 24, 29
+         vadduwm 25, 25, 30
+       vxor 5, 5, 10
+       vxor 6, 6, 11
+       vxor 7, 7, 8
+       vxor 4, 4, 9
+         vxor 21, 21, 26
+         vxor 22, 22, 27
+         vxor 23, 23, 24
+         vxor 20, 20, 25
+
+       xxlor   0, 32+25, 32+25
+       xxlor   32+25, 21, 21
+       vrlw 5, 5, 25
+       vrlw 6, 6, 25
+       vrlw 7, 7, 25
+       vrlw 4, 4, 25
+         vrlw 21, 21, 25
+         vrlw 22, 22, 25
+         vrlw 23, 23, 25
+         vrlw 20, 20, 25
+       xxlor   32+25, 0, 0
+
+       vadduwm 0, 0, 5
+       vadduwm 1, 1, 6
+       vadduwm 2, 2, 7
+       vadduwm 3, 3, 4
+         vadduwm 16, 16, 21
+         vadduwm 17, 17, 22
+         vadduwm 18, 18, 23
+         vadduwm 19, 19, 20
+
+       xxlor   0, 32+25, 32+25
+       xxlor   32+25, 22, 22
+         vpermxor 15, 15, 0, 25
+         vpermxor 12, 12, 1, 25
+         vpermxor 13, 13, 2, 25
+         vpermxor 14, 14, 3, 25
+         vpermxor 31, 31, 16, 25
+         vpermxor 28, 28, 17, 25
+         vpermxor 29, 29, 18, 25
+         vpermxor 30, 30, 19, 25
+       xxlor   32+25, 0, 0
+
+       vadduwm 10, 10, 15
+       vadduwm 11, 11, 12
+       vadduwm 8, 8, 13
+       vadduwm 9, 9, 14
+         vadduwm 26, 26, 31
+         vadduwm 27, 27, 28
+         vadduwm 24, 24, 29
+         vadduwm 25, 25, 30
+
+       xxlor   0, 32+28, 32+28
+       xxlor   32+28, 23, 23
+       vxor 5, 5, 10
+       vxor 6, 6, 11
+       vxor 7, 7, 8
+       vxor 4, 4, 9
+         vxor 21, 21, 26
+         vxor 22, 22, 27
+         vxor 23, 23, 24
+         vxor 20, 20, 25
+       vrlw 5, 5, 28
+       vrlw 6, 6, 28
+       vrlw 7, 7, 28
+       vrlw 4, 4, 28
+         vrlw 21, 21, 28
+         vrlw 22, 22, 28
+         vrlw 23, 23, 28
+         vrlw 20, 20, 28
+       xxlor   32+28, 0, 0
+.endm
+
+.macro QT_loop_4x
+       # QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+       vadduwm 0, 0, 4
+       vadduwm 1, 1, 5
+       vadduwm 2, 2, 6
+       vadduwm 3, 3, 7
+         vpermxor 12, 12, 0, 20
+         vpermxor 13, 13, 1, 20
+         vpermxor 14, 14, 2, 20
+         vpermxor 15, 15, 3, 20
+       vadduwm 8, 8, 12
+       vadduwm 9, 9, 13
+       vadduwm 10, 10, 14
+       vadduwm 11, 11, 15
+       vxor 4, 4, 8
+       vxor 5, 5, 9
+       vxor 6, 6, 10
+       vxor 7, 7, 11
+       vrlw 4, 4, 21
+       vrlw 5, 5, 21
+       vrlw 6, 6, 21
+       vrlw 7, 7, 21
+       vadduwm 0, 0, 4
+       vadduwm 1, 1, 5
+       vadduwm 2, 2, 6
+       vadduwm 3, 3, 7
+         vpermxor 12, 12, 0, 22
+         vpermxor 13, 13, 1, 22
+         vpermxor 14, 14, 2, 22
+         vpermxor 15, 15, 3, 22
+       vadduwm 8, 8, 12
+       vadduwm 9, 9, 13
+       vadduwm 10, 10, 14
+       vadduwm 11, 11, 15
+       vxor 4, 4, 8
+       vxor 5, 5, 9
+       vxor 6, 6, 10
+       vxor 7, 7, 11
+       vrlw 4, 4, 23
+       vrlw 5, 5, 23
+       vrlw 6, 6, 23
+       vrlw 7, 7, 23
+
+       # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+       vadduwm 0, 0, 5
+       vadduwm 1, 1, 6
+       vadduwm 2, 2, 7
+       vadduwm 3, 3, 4
+         vpermxor 15, 15, 0, 20
+         vpermxor 12, 12, 1, 20
+         vpermxor 13, 13, 2, 20
+         vpermxor 14, 14, 3, 20
+       vadduwm 10, 10, 15
+       vadduwm 11, 11, 12
+       vadduwm 8, 8, 13
+       vadduwm 9, 9, 14
+       vxor 5, 5, 10
+       vxor 6, 6, 11
+       vxor 7, 7, 8
+       vxor 4, 4, 9
+       vrlw 5, 5, 21
+       vrlw 6, 6, 21
+       vrlw 7, 7, 21
+       vrlw 4, 4, 21
+       vadduwm 0, 0, 5
+       vadduwm 1, 1, 6
+       vadduwm 2, 2, 7
+       vadduwm 3, 3, 4
+         vpermxor 15, 15, 0, 22
+         vpermxor 12, 12, 1, 22
+         vpermxor 13, 13, 2, 22
+         vpermxor 14, 14, 3, 22
+       vadduwm 10, 10, 15
+       vadduwm 11, 11, 12
+       vadduwm 8, 8, 13
+       vadduwm 9, 9, 14
+       vxor 5, 5, 10
+       vxor 6, 6, 11
+       vxor 7, 7, 8
+       vxor 4, 4, 9
+       vrlw 5, 5, 23
+       vrlw 6, 6, 23
+       vrlw 7, 7, 23
+       vrlw 4, 4, 23
+.endm
+
+# Transpose
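+# 4x4 word transpose: on entry each of \a0-\a3 holds one state word for four
+# blocks (one block per lane); on exit each vector holds four consecutive
+# words of a single block, ready to be added to the saved state and stored.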
+.macro TP_4x a0 a1 a2 a3
+       xxmrghw  10, 32+\a0, 32+\a1     # a0, a1, b0, b1
+       xxmrghw  11, 32+\a2, 32+\a3     # a2, a3, b2, b3
+       xxmrglw  12, 32+\a0, 32+\a1     # c0, c1, d0, d1
+       xxmrglw  13, 32+\a2, 32+\a3     # c2, c3, d2, d3
+       xxpermdi        32+\a0, 10, 11, 0       # a0, a1, a2, a3
+       xxpermdi        32+\a1, 10, 11, 3       # b0, b1, b2, b3
+       xxpermdi        32+\a2, 12, 13, 0       # c0, c1, c2, c3
+       xxpermdi        32+\a3, 12, 13, 3       # d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+       vadduwm \S+0, \S+0, 16-\S
+       vadduwm \S+4, \S+4, 17-\S
+       vadduwm \S+8, \S+8, 18-\S
+       vadduwm \S+12, \S+12, 19-\S
+
+       vadduwm \S+1, \S+1, 16-\S
+       vadduwm \S+5, \S+5, 17-\S
+       vadduwm \S+9, \S+9, 18-\S
+       vadduwm \S+13, \S+13, 19-\S
+
+       vadduwm \S+2, \S+2, 16-\S
+       vadduwm \S+6, \S+6, 17-\S
+       vadduwm \S+10, \S+10, 18-\S
+       vadduwm \S+14, \S+14, 19-\S
+
+       vadduwm \S+3, \S+3, 16-\S
+       vadduwm \S+7, \S+7, 17-\S
+       vadduwm \S+11, \S+11, 18-\S
+       vadduwm \S+15, \S+15, 19-\S
+.endm
+
+#
+# Write 256 bytes: load 16 quadwords of input, xor them with the generated
+# key stream, and store 16 quadwords of output.
+#
+.macro Write_256 S
+       add 9, 14, 5
+       add 16, 14, 4
+       lxvw4x 0, 0, 9
+       lxvw4x 1, 17, 9
+       lxvw4x 2, 18, 9
+       lxvw4x 3, 19, 9
+       lxvw4x 4, 20, 9
+       lxvw4x 5, 21, 9
+       lxvw4x 6, 22, 9
+       lxvw4x 7, 23, 9
+       lxvw4x 8, 24, 9
+       lxvw4x 9, 25, 9
+       lxvw4x 10, 26, 9
+       lxvw4x 11, 27, 9
+       lxvw4x 12, 28, 9
+       lxvw4x 13, 29, 9
+       lxvw4x 14, 30, 9
+       lxvw4x 15, 31, 9
+
+       xxlxor \S+32, \S+32, 0
+       xxlxor \S+36, \S+36, 1
+       xxlxor \S+40, \S+40, 2
+       xxlxor \S+44, \S+44, 3
+       xxlxor \S+33, \S+33, 4
+       xxlxor \S+37, \S+37, 5
+       xxlxor \S+41, \S+41, 6
+       xxlxor \S+45, \S+45, 7
+       xxlxor \S+34, \S+34, 8
+       xxlxor \S+38, \S+38, 9
+       xxlxor \S+42, \S+42, 10
+       xxlxor \S+46, \S+46, 11
+       xxlxor \S+35, \S+35, 12
+       xxlxor \S+39, \S+39, 13
+       xxlxor \S+43, \S+43, 14
+       xxlxor \S+47, \S+47, 15
+
+       stxvw4x \S+32, 0, 16
+       stxvw4x \S+36, 17, 16
+       stxvw4x \S+40, 18, 16
+       stxvw4x \S+44, 19, 16
+
+       stxvw4x \S+33, 20, 16
+       stxvw4x \S+37, 21, 16
+       stxvw4x \S+41, 22, 16
+       stxvw4x \S+45, 23, 16
+
+       stxvw4x \S+34, 24, 16
+       stxvw4x \S+38, 25, 16
+       stxvw4x \S+42, 26, 16
+       stxvw4x \S+46, 27, 16
+
+       stxvw4x \S+35, 28, 16
+       stxvw4x \S+39, 29, 16
+       stxvw4x \S+43, 30, 16
+       stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
+#
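+# On entry: r3 = state, r4 = dst, r5 = src, r6 = len (bytes), r7 = nrounds.
+#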
+SYM_FUNC_START(chacha_p10le_8x)
+.align 5
+       cmpdi   6, 0
+       ble     Out_no_chacha
+
+       SAVE_REGS
+
+       # r17 - r31 hold the quadword offsets 16, 32, ..., 240, used mainly by
+       # the Write_256 macro.
+       li      17, 16
+       li      18, 32
+       li      19, 48
+       li      20, 64
+       li      21, 80
+       li      22, 96
+       li      23, 112
+       li      24, 128
+       li      25, 144
+       li      26, 160
+       li      27, 176
+       li      28, 192
+       li      29, 208
+       li      30, 224
+       li      31, 240
+
+       mr 15, 6                        # len
+       li 14, 0                        # offset to inp and outp
+
+        lxvw4x 48, 0, 3                #  vr16, constants
+       lxvw4x  49, 17, 3               #  vr17, key 1
+       lxvw4x  50, 18, 3               #  vr18, key 2
+       lxvw4x  51, 19, 3               #  vr19, counter, nonce
+
+       # create (0, 1, 2, 3) counters
+       vspltisw 0, 0
+       vspltisw 1, 1
+       vspltisw 2, 2
+       vspltisw 3, 3
+       vmrghw  4, 0, 1
+       vmrglw  5, 2, 3
+       vsldoi  30, 4, 5, 8             # vr30 counter, 4 (0, 1, 2, 3)
+
+       vspltisw 21, 12
+       vspltisw 23, 7
+
+       addis   11, 2, permx@toc@ha
+       addi    11, 11, permx@toc@l
+       lxvw4x  32+20, 0, 11
+       lxvw4x  32+22, 17, 11
+
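+       # Each pass of the quarter-round loop below performs one column round
+       # and one diagonal round, so it is run nrounds / 2 times.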
+       sradi   8, 7, 1
+
+       mtctr 8
+
+       # save constants to vsx
+       xxlor   16, 48, 48
+       xxlor   17, 49, 49
+       xxlor   18, 50, 50
+       xxlor   19, 51, 51
+
+       vspltisw 25, 4
+       vspltisw 26, 8
+
+       xxlor   25, 32+26, 32+26
+       xxlor   24, 32+25, 32+25
+
+       vadduwm 31, 30, 25              # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
+       xxlor   30, 32+30, 32+30
+       xxlor   31, 32+31, 32+31
+
+       xxlor   20, 32+20, 32+20
+       xxlor   21, 32+21, 32+21
+       xxlor   22, 32+22, 32+22
+       xxlor   23, 32+23, 32+23
+
+       cmpdi   6, 512
+       blt     Loop_last
+
+Loop_8x:
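+       # Expand the state into 8 blocks: each vector gets one state word
+       # splatted across its 4 lanes, so v0-v15 carry blocks 0-3 and
+       # v16-v31 carry blocks 4-7; per-lane counter offsets are added from
+       # v30 and v31 below.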
+       xxspltw  32+0, 16, 0
+       xxspltw  32+1, 16, 1
+       xxspltw  32+2, 16, 2
+       xxspltw  32+3, 16, 3
+
+       xxspltw  32+4, 17, 0
+       xxspltw  32+5, 17, 1
+       xxspltw  32+6, 17, 2
+       xxspltw  32+7, 17, 3
+       xxspltw  32+8, 18, 0
+       xxspltw  32+9, 18, 1
+       xxspltw  32+10, 18, 2
+       xxspltw  32+11, 18, 3
+       xxspltw  32+12, 19, 0
+       xxspltw  32+13, 19, 1
+       xxspltw  32+14, 19, 2
+       xxspltw  32+15, 19, 3
+       vadduwm 12, 12, 30      # increase counter
+
+       xxspltw  32+16, 16, 0
+       xxspltw  32+17, 16, 1
+       xxspltw  32+18, 16, 2
+       xxspltw  32+19, 16, 3
+
+       xxspltw  32+20, 17, 0
+       xxspltw  32+21, 17, 1
+       xxspltw  32+22, 17, 2
+       xxspltw  32+23, 17, 3
+       xxspltw  32+24, 18, 0
+       xxspltw  32+25, 18, 1
+       xxspltw  32+26, 18, 2
+       xxspltw  32+27, 18, 3
+       xxspltw  32+28, 19, 0
+       xxspltw  32+29, 19, 1
+       vadduwm 28, 28, 31      # increase counter
+       xxspltw  32+30, 19, 2
+       xxspltw  32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+       QT_loop_8x
+
+       bdnz    quarter_loop_8x
+
+       xxlor   0, 32+30, 32+30
+       xxlor   32+30, 30, 30
+       vadduwm 12, 12, 30
+       xxlor   32+30, 0, 0
+       TP_4x 0, 1, 2, 3
+       TP_4x 4, 5, 6, 7
+       TP_4x 8, 9, 10, 11
+       TP_4x 12, 13, 14, 15
+
+       xxlor   0, 48, 48
+       xxlor   1, 49, 49
+       xxlor   2, 50, 50
+       xxlor   3, 51, 51
+       xxlor   48, 16, 16
+       xxlor   49, 17, 17
+       xxlor   50, 18, 18
+       xxlor   51, 19, 19
+       Add_state 0
+       xxlor   48, 0, 0
+       xxlor   49, 1, 1
+       xxlor   50, 2, 2
+       xxlor   51, 3, 3
+       Write_256 0
+       addi    14, 14, 256     # offset +=256
+       addi    15, 15, -256    # len -=256
+
+       xxlor   5, 32+31, 32+31
+       xxlor   32+31, 31, 31
+       vadduwm 28, 28, 31
+       xxlor   32+31, 5, 5
+       TP_4x 16+0, 16+1, 16+2, 16+3
+       TP_4x 16+4, 16+5, 16+6, 16+7
+       TP_4x 16+8, 16+9, 16+10, 16+11
+       TP_4x 16+12, 16+13, 16+14, 16+15
+
+       xxlor   32, 16, 16
+       xxlor   33, 17, 17
+       xxlor   34, 18, 18
+       xxlor   35, 19, 19
+       Add_state 16
+       Write_256 16
+       addi    14, 14, 256     # offset +=256
+       addi    15, 15, -256    # len -= 256
+
+       xxlor   32+24, 24, 24
+       xxlor   32+25, 25, 25
+       xxlor   32+30, 30, 30
+       vadduwm 30, 30, 25
+       vadduwm 31, 30, 24
+       xxlor   30, 32+30, 32+30
+       xxlor   31, 32+31, 32+31
+
+       cmpdi   15, 0
+       beq     Out_loop
+
+       cmpdi   15, 512
+       blt     Loop_last
+
+       mtctr 8
+       b Loop_8x
+
+Loop_last:
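+       # Handle the remaining data 256 bytes (4 blocks) at a time.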
+        lxvw4x 48, 0, 3                #  vr16, constants
+       lxvw4x  49, 17, 3               #  vr17, key 1
+       lxvw4x  50, 18, 3               #  vr18, key 2
+       lxvw4x  51, 19, 3               #  vr19, counter, nonce
+
+       vspltisw 21, 12
+       vspltisw 23, 7
+       addis   11, 2, permx@toc@ha
+       addi    11, 11, permx@toc@l
+       lxvw4x  32+20, 0, 11
+       lxvw4x  32+22, 17, 11
+
+       sradi   8, 7, 1
+       mtctr 8
+
+Loop_4x:
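+       # Expand the state into 4 blocks: one state word splatted per vector,
+       # with per-lane counter offsets added from v30.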
+       vspltw  0, 16, 0
+       vspltw  1, 16, 1
+       vspltw  2, 16, 2
+       vspltw  3, 16, 3
+
+       vspltw  4, 17, 0
+       vspltw  5, 17, 1
+       vspltw  6, 17, 2
+       vspltw  7, 17, 3
+       vspltw  8, 18, 0
+       vspltw  9, 18, 1
+       vspltw  10, 18, 2
+       vspltw  11, 18, 3
+       vspltw  12, 19, 0
+       vadduwm 12, 12, 30      # increase counter
+       vspltw  13, 19, 1
+       vspltw  14, 19, 2
+       vspltw  15, 19, 3
+
+.align 5
+quarter_loop:
+       QT_loop_4x
+
+       bdnz    quarter_loop
+
+       vadduwm 12, 12, 30
+       TP_4x 0, 1, 2, 3
+       TP_4x 4, 5, 6, 7
+       TP_4x 8, 9, 10, 11
+       TP_4x 12, 13, 14, 15
+
+       Add_state 0
+       Write_256 0
+       addi    14, 14, 256     # offset += 256
+       addi    15, 15, -256    # len -= 256
+
+       # Update state counter
+       vspltisw 25, 4
+       vadduwm 30, 30, 25
+
+       cmpdi   15, 0
+       beq     Out_loop
+       cmpdi   15, 256
+       blt     Out_loop
+
+       mtctr 8
+       b Loop_4x
+
+Out_loop:
+       RESTORE_REGS
+       blr
+
+Out_no_chacha:
+       li      3, 0
+       blr
+SYM_FUNC_END(chacha_p10le_8x)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 5
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
+SYM_DATA_END(PERMX)
-- 
2.31.1
