This patch introduces an optimized powerpc64 assembly implementation of
sha256-compress-n. It uses the vshasigmaw instruction and unrolls the
round loops so that more independent instructions can execute in
parallel.
The following data was captured on a POWER 10 LPAR @ ~3.896GHz
Current C implementation:
Algorithm mode Mbyte/s
sha256 update 280.97
hmac-sha256 64 bytes 80.81
hmac-sha256 256 bytes 170.50
hmac-sha256 1024 bytes 241.92
hmac-sha256 4096 bytes 268.54
hmac-sha256 single msg 276.16
With optimized assembly:
Algorithm mode Mbyte/s
sha256 update 446.42
hmac-sha256 64 bytes 124.89
hmac-sha256 256 bytes 268.90
hmac-sha256 1024 bytes 382.06
hmac-sha256 4096 bytes 425.38
hmac-sha256 single msg 439.75
Signed-off-by: Eric Richter <[email protected]>
---
fat-ppc.c | 12 +
powerpc64/fat/sha256-compress-n-2.asm | 36 +++
powerpc64/p8/sha256-compress-n.asm | 339 ++++++++++++++++++++++++++
3 files changed, 387 insertions(+)
create mode 100644 powerpc64/fat/sha256-compress-n-2.asm
create mode 100644 powerpc64/p8/sha256-compress-n.asm
diff --git a/fat-ppc.c b/fat-ppc.c
index cd76f7a1..efbeb2ec 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -203,6 +203,10 @@ DECLARE_FAT_FUNC(_nettle_poly1305_blocks,
poly1305_blocks_func)
DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, c)
DECLARE_FAT_FUNC_VAR(poly1305_blocks, poly1305_blocks_func, ppc64)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) /* implementation is picked in fat_init */
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, c) /* C variant, the fallback */
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, ppc64) /* powerpc64 assembly variant */
+
static void CONSTRUCTOR
fat_init (void)
@@ -231,6 +235,8 @@ fat_init (void)
_nettle_ghash_update_arm64() */
_nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
_nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+
+ _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_ppc64;
}
else
{
@@ -239,6 +245,7 @@ fat_init (void)
_nettle_aes_invert_vec = _nettle_aes_invert_c;
_nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
_nettle_ghash_update_vec = _nettle_ghash_update_c;
+ _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_c;
}
if (features.have_altivec)
{
@@ -338,3 +345,8 @@ DEFINE_FAT_FUNC(_nettle_poly1305_blocks, const uint8_t *,
size_t blocks,
const uint8_t *m),
(ctx, blocks, m))
+
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+ (uint32_t *state, const uint32_t *k,
+ size_t blocks, const uint8_t *input),
+ (state, k, blocks, input)) /* entry point; the variant it runs is assigned in fat_init */
diff --git a/powerpc64/fat/sha256-compress-n-2.asm
b/powerpc64/fat/sha256-compress-n-2.asm
new file mode 100644
index 00000000..4f4eee9d
--- /dev/null
+++ b/powerpc64/fat/sha256-compress-n-2.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/sha256-compress-n-2.asm
+
+ifelse(`
+ Copyright (C) 2024 Eric Richter, IBM Corporation
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
+C fat_transform appends _ppc64, matching _nettle_sha256_compress_n_ppc64 in fat-ppc.c
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/sha256-compress-n.asm')
diff --git a/powerpc64/p8/sha256-compress-n.asm
b/powerpc64/p8/sha256-compress-n.asm
new file mode 100644
index 00000000..52f548dc
--- /dev/null
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -0,0 +1,339 @@
+C powerpc64/p8/sha256-compress-n.asm
+
+ifelse(`
+ Copyright (C) 2024 Eric Richter, IBM Corporation
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+.file "sha256-compress-n.asm"
+
+C Stack pointer (r1) and the incoming arguments (r3-r6)
+define(`SP', `1')
+define(`STATE', `3')
+define(`K', `4')
+define(`NUMBLOCKS', `5')
+define(`INPUT', `6')
+
+define(`T0', `7') C scratch: offset register for vector loads and stores
+define(`T1', `8') C scratch handed to DATA_LOAD_VEC
+define(`TK', `9') C byte offset of the next load from K
+define(`COUNT', `10') C only saved and restored, otherwise unused here
+
+C State registers: working variables a..h, one vector register each
+define(`VSA', `0')
+define(`VSB', `1')
+define(`VSC', `2')
+define(`VSD', `3')
+define(`VSE', `4')
+define(`VSF', `5')
+define(`VSG', `6')
+define(`VSH', `7')
+
+C Round constants most recently loaded from K
+define(`VK', `8')
+
+C Temp registers for math
+define(`VT0', `9')
+define(`VT1', `10')
+define(`VT2', `11')
+define(`VT3', `12')
+define(`VT4', `13')
+
+C Results of vshasigmaw; also reused as temporaries when extending W
+define(`SIGA', `14')
+define(`SIGE', `15')
+
+C Input words W[i]. Not referenced by name, only through IV(); listed to keep track
+define(`VW0', `16')
+define(`VW1', `17')
+define(`VW2', `18')
+define(`VW3', `19')
+define(`VW4', `20') C v20-v31 are saved and restored by the function
+define(`VW5', `21')
+define(`VW6', `22')
+define(`VW7', `23')
+define(`VW8', `24')
+define(`VW9', `25')
+define(`VW10', `26')
+define(`VW11', `27')
+define(`VW12', `28')
+define(`VW13', `29')
+define(`VW14', `30')
+define(`VW15', `31')
+
+C Convert an index for W[i] to the corresponding register
+define(`IV', `eval($1 + VW0)')
+
+C ROUND(A B C D E F G H R): one SHA-256 round on W[R] with the constant in VK.
+C The new e replaces D and the new a replaces H; callers rotate the roles.
+define(`ROUND', `
+
+ vadduwm VT1, VK, IV($9) C VT1: k+W
+ vadduwm VT4, $8, VT1 C VT4: H+k+W
+
+ lxvw4x VSR(VK), TK, K C Load 16 bytes of constants for the next round
+ addi TK, TK, 4 C Advance by one word; NOTE(review): the 16-byte loads overrun a 64-word K in the last rounds
+
+ vadduwm VT2, $4, $8 C VT2: H+D
+ vadduwm VT2, VT2, VT1 C VT2: H+D+k+W
+
+ vshasigmaw SIGE, $5, 1, 0b1111 C Sigma(E) Se
+ vshasigmaw SIGA, $1, 1, 0 C Sigma(A) Sa
+
+ vxor VT3, $2, $3 C VT3: b^c
+ vsel VT0, $7, $6, $5 C VT0: Ch.
+ vsel VT3, $3, $1, VT3 C VT3: Maj(a,b,c)
+
+ vadduwm VT4, VT4, VT0 C VT4: Hkw + Ch.
+ vadduwm VT3, VT3, VT4 C VT3: HkW + Ch. + Maj.
+
+ vadduwm VT0, VT0, VT2 C VT0: Ch. + DHKW
+ vadduwm $8, SIGE, SIGA C Anext: Se + Sa
+ vadduwm $4, VT0, SIGE C Dnext: Ch. + DHKW + Se
+ vadduwm $8, $8, VT3 C Anext: Se+Sa+HkW+Ch.+Maj.
+')
+
+C EXTEND(R): W[R] += sigma1(W[R+14]) + sigma0(W[R+1]) + W[R+9], indices mod 16.
+C A separate macro, so ROUND needs no tenth argument (not portable in m4).
+define(`EXTEND', `
+ vshasigmaw SIGE, IV(($1 + 14) % 16), 0, 0b1111
+ vshasigmaw SIGA, IV(($1 + 1) % 16), 0, 0b0000
+ vadduwm IV($1), IV($1), SIGE
+ vadduwm IV($1), IV($1), SIGA
+ vadduwm IV($1), IV($1), IV(($1 + 9) % 16)
+')
+
+define(`EXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9) EXTEND($9)')
+define(`NOEXTENDROUND', `ROUND($1, $2, $3, $4, $5, $6, $7, $8, $9)')
+
+define(`NOEXTENDROUNDS', `
+ NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0) C 16 rounds on W[0..15] as they stand
+ NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1) C register roles shift by one each round
+ NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+ NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+ NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+ NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+ NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+ NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+ NOEXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8) C same register assignment as round 0
+ NOEXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+ NOEXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+ NOEXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+ NOEXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+ NOEXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+ NOEXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+ NOEXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`EXTENDROUNDS', `
+ EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 0) C 16 rounds; each also extends the W word it used
+ EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 1) C register roles shift by one each round
+ EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 2)
+ EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 3)
+
+ EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 4)
+ EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 5)
+ EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 6)
+ EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 7)
+
+ EXTENDROUND(VSA, VSB, VSC, VSD, VSE, VSF, VSG, VSH, 8) C same register assignment as round 0
+ EXTENDROUND(VSH, VSA, VSB, VSC, VSD, VSE, VSF, VSG, 9)
+ EXTENDROUND(VSG, VSH, VSA, VSB, VSC, VSD, VSE, VSF, 10)
+ EXTENDROUND(VSF, VSG, VSH, VSA, VSB, VSC, VSD, VSE, 11)
+
+ EXTENDROUND(VSE, VSF, VSG, VSH, VSA, VSB, VSC, VSD, 12)
+ EXTENDROUND(VSD, VSE, VSF, VSG, VSH, VSA, VSB, VSC, 13)
+ EXTENDROUND(VSC, VSD, VSE, VSF, VSG, VSH, VSA, VSB, 14)
+ EXTENDROUND(VSB, VSC, VSD, VSE, VSF, VSG, VSH, VSA, 15)
+')
+
+define(`LOAD', `
+ IF_BE(`lxvw4x VSR(IV($1)), 0, INPUT')
+ IF_LE(`
+ lxvd2x VSR(IV($1)), 0, INPUT
+ vperm IV($1), IV($1), IV($1), VT0 C reorder with the selector DOLOADS put in VT0
+ ')
+ addi INPUT, INPUT, 4 C next word; NOTE(review): each load fetches 16 bytes, so LOAD(15) reads 12 bytes past the 64-byte block
+')
+
+define(`DOLOADS', `
+ IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)') C little-endian only: fetch the vperm selector used by LOAD
+ LOAD(0) C register IV(i) is loaded starting at input word i
+ LOAD(1)
+ LOAD(2)
+ LOAD(3)
+
+ LOAD(4)
+ LOAD(5)
+ LOAD(6)
+ LOAD(7)
+
+ LOAD(8)
+ LOAD(9)
+ LOAD(10)
+ LOAD(11)
+
+ LOAD(12)
+ LOAD(13)
+ LOAD(14)
+ LOAD(15) C INPUT has now advanced by 64 bytes
+')
+
+.text
+PROLOGUE(_nettle_sha256_compress_n)
+ cmpldi 0, NUMBLOCKS, 0 C blocks is a 64-bit size_t: test all 64 bits, unsigned
+ beq 0, .done
+ mtctr NUMBLOCKS C CTR counts the 64-byte blocks still to process
+
+ C Store non-volatile registers: 256-byte frame, GPRs at 8..31, v20-v31 at 64..255
+ subi SP, SP, 64+(12*16)
+ std T0, 24(SP)
+ std T1, 16(SP)
+ std COUNT, 8(SP)
+
+ li T0, 64+(11*16) C v20 goes at the top of the frame, v31 ends up at SP+64
+ stvx 20, T0, SP
+ subi T0, T0, 16
+ stvx 21, T0, SP
+ subi T0, T0, 16
+ stvx 22, T0, SP
+ subi T0, T0, 16
+ stvx 23, T0, SP
+ subi T0, T0, 16
+ stvx 24, T0, SP
+ subi T0, T0, 16
+ stvx 25, T0, SP
+ subi T0, T0, 16
+ stvx 26, T0, SP
+ subi T0, T0, 16
+ stvx 27, T0, SP
+ subi T0, T0, 16
+ stvx 28, T0, SP
+ subi T0, T0, 16
+ stvx 29, T0, SP
+ subi T0, T0, 16
+ stvx 30, T0, SP
+ subi T0, T0, 16
+ stvx 31, T0, SP
+
+ C Load state values
+ li T0, 16
+ lxvw4x VSR(VSA), 0, STATE C VSA contains A,B,C,D
+ lxvw4x VSR(VSE), T0, STATE C VSE contains E,F,G,H
+
+.loop:
+ li TK, 0
+ lxvw4x VSR(VK), TK, K
+ addi TK, TK, 4
+
+ DOLOADS
+
+ C "permute" state from VSA containing A,B,C,D into VSA,VSB,VSC,VSD
+ vsldoi VSB, VSA, VSA, 4
+ vsldoi VSF, VSE, VSE, 4
+
+ vsldoi VSC, VSA, VSA, 8
+ vsldoi VSG, VSE, VSE, 8
+
+ vsldoi VSD, VSA, VSA, 12
+ vsldoi VSH, VSE, VSE, 12
+
+ EXTENDROUNDS
+ EXTENDROUNDS
+ EXTENDROUNDS
+ NOEXTENDROUNDS
+
+ C Reload the input state of this block from STATE (memory, not the stack)
+ li T0, 16
+ lxvw4x VSR(VT0), 0, STATE C VT0: previous A,B,C,D
+ lxvw4x VSR(VT1), T0, STATE C VT1: previous E,F,G,H
+
+ C Repack VSA,VSB,VSC,VSD into VSA,VSE for storing
+ vmrghw VSA, VSA, VSB
+ vmrghw VSC, VSC, VSD
+ vmrghw VSE, VSE, VSF
+ vmrghw VSG, VSG, VSH
+
+ xxmrghd VSR(VSA), VSR(VSA), VSR(VSC)
+ xxmrghd VSR(VSE), VSR(VSE), VSR(VSG)
+
+ vadduwm VSA, VSA, VT0
+ vadduwm VSE, VSE, VT1
+
+ li T0, 16
+ stxvw4x VSR(VSA), 0, STATE
+ stxvw4x VSR(VSE), T0, STATE
+
+ bdnz .loop
+
+ C Restore nonvolatile registers from the same slots used by the stores above
+ li T0, 64+(11*16)
+ lvx 20, T0, SP
+ subi T0, T0, 16
+ lvx 21, T0, SP
+ subi T0, T0, 16
+ lvx 22, T0, SP
+ subi T0, T0, 16
+ lvx 23, T0, SP
+ subi T0, T0, 16
+ lvx 24, T0, SP
+ subi T0, T0, 16
+ lvx 25, T0, SP
+ subi T0, T0, 16
+ lvx 26, T0, SP
+ subi T0, T0, 16
+ lvx 27, T0, SP
+ subi T0, T0, 16
+ lvx 28, T0, SP
+ subi T0, T0, 16
+ lvx 29, T0, SP
+ subi T0, T0, 16
+ lvx 30, T0, SP
+ subi T0, T0, 16
+ lvx 31, T0, SP
+
+ ld T0, 24(SP)
+ ld T1, 16(SP)
+ ld COUNT, 8(SP)
+ addi SP, SP, 64+(12*16)
+
+.done:
+ mr 3, INPUT C return the input pointer, advanced past the processed blocks
+
+ blr
+EPILOGUE(_nettle_sha256_compress_n)
+
+IF_LE(`
+.data
+.align 4
+.load_swap:
+ .byte 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 C vperm selector that DOLOADS loads into VT0
+')
--
2.44.0
_______________________________________________
nettle-bugs mailing list -- [email protected]
To unsubscribe send an email to [email protected]