This implementation is based on the existing, per-algorithm optimized
powerpc64/p8/aes-encrypt-internal.asm and powerpc64/p8/gcm-hash.asm
implementations by Niels Möller and Mamone Tarsha.
Significant changes:
- Combine AES + GCM into a single function call that does up to 8x
unrolled AES followed by two passes of 4x unrolled GCM back-to-back (a
caller-side sketch follows below).
- Handle the IV|CTR increment in assembly and avoid the somewhat
costly gcm_fill() call to precalculate the counter values.
- Use ISA 3.0 (P9) lxvb16x/stxvb16x to load/store unaligned VSX
registers to avoid permutes on LE machines.
- Use ISA 3.0 (P9) lxvll/stxvll to load/store left-aligned,
zero-padded partial (<16B) blocks.
- Use ISA 3.0 (P9) lxv/stxv to load/store the non-volatile vector
registers from/to the stack redzone to avoid using a GPR register as
an index.
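
For illustration, a hedged caller-side sketch of how gcm.c could invoke the
combined routine; only _nettle_gcm_aes_encrypt and its prototype come from
the new assembly file, everything else below is an assumption and not part
of this patch:

    #include <stddef.h>
    #include <stdint.h>

    struct gcm_key;      /* opaque here; declared in gcm.h */
    union gcm_block;     /* opaque here; declared in gcm.h */

    /* Prototype as documented in powerpc64/p9/gcm-aes-encrypt.asm */
    void _nettle_gcm_aes_encrypt (const struct gcm_key *key,
                                  union gcm_block *x, size_t length,
                                  const uint8_t *src, unsigned rounds,
                                  const uint32_t *keys, uint8_t *dst,
                                  uint32_t *ctr);

    static void
    encrypt_sketch (const struct gcm_key *key, union gcm_block *digest,
                    uint32_t *ctr_block, const uint32_t *round_keys,
                    unsigned rounds, size_t length,
                    uint8_t *dst, const uint8_t *src)
    {
      /* One call covers the 8x/4x/2x/1x and partial-block paths; the
         assembly also advances the IV||counter block in place, so no
         gcm_fill() pre-pass and no separate gcm_hash() pass over the
         ciphertext are needed.  */
      _nettle_gcm_aes_encrypt (key, digest, length, src,
                               rounds, round_keys, dst, ctr_block);
    }
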
Signed-off-by: Christopher M. Riedl <[email protected]>
---
gcm.c | 4 +
powerpc64/p9/gcm-aes-encrypt.asm | 666 +++++++++++++++++++++++++++++++
2 files changed, 670 insertions(+)
create mode 100644 powerpc64/p9/gcm-aes-encrypt.asm
diff --git a/gcm.c b/gcm.c
index 6fe25a01..39e7a7c7 100644
--- a/gcm.c
+++ b/gcm.c
@@ -61,8 +61,12 @@
GCM_TABLE_BITS == 8 layout */
#undef HAVE_NATIVE_gcm_hash
#undef HAVE_NATIVE_gcm_init_key
+#undef HAVE_NATIVE_gcm_aes_decrypt
+#undef HAVE_NATIVE_gcm_aes_encrypt
#undef HAVE_NATIVE_fat_gcm_hash
#undef HAVE_NATIVE_fat_gcm_init_key
+#undef HAVE_NATIVE_fat_gcm_aes_decrypt
+#undef HAVE_NATIVE_fat_gcm_aes_encrypt
#endif
#if !HAVE_NATIVE_gcm_hash
diff --git a/powerpc64/p9/gcm-aes-encrypt.asm b/powerpc64/p9/gcm-aes-encrypt.asm
new file mode 100644
index 00000000..43f577fa
--- /dev/null
+++ b/powerpc64/p9/gcm-aes-encrypt.asm
@@ -0,0 +1,666 @@
+C powerpc64/p9/gcm-aes-encrypt.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Mamone Tarsha
+ Copyright (C) 2021 Christopher M. Riedl
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+
+.file "gcm-aes-encrypt.asm"
+
+.text
+
+C void gcm_aes_encrypt(const struct gcm_key *key, union gcm_block *x,
+C size_t length, const uint8_t *src,
+C unsigned rounds, const uint32_t *keys,
+C uint8_t *dst, uint32_t *ctr)
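+C
+C 'ctr' points to the 16B IV||counter block; it is loaded at entry and the
+C incremented value is written back before returning.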
+
+C Register usage:
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Parameters:
+define(`TABLE', `r3')
+define(`X', `r4') C Output GCM/Ghash tag
+define(`LENGTH',`r5')
+define(`SRC', `r6') C Plaintext input
+define(`ROUNDS',`r7')
+define(`KEYS', `r8')
+define(`DST', `r9')
+define(`PCTR', `r10') C Pointer to 12B IV and starting 4B ctr
+
+C GCM/Ghash:
+define(`POLY_L',`v0')
+define(`D', `v1')
+define(`H1M', `v6')
+define(`H1L', `v7')
+define(`H2M', `v8')
+define(`H2L', `v9')
+define(`H3M', `v10')
+define(`H3L', `v11')
+define(`H4M', `v12')
+define(`H4L', `v13')
+define(`R', `v14')
+define(`F', `v15')
+define(`R2', `v16')
+define(`F2', `v17')
+define(`T', `v18')
+define(`R3', `v20')
+define(`F3', `v21')
+define(`R4', `v22')
+define(`F4', `v23')
+
+C AES:
+define(`K', `v25')
+define(`S0', `v2')
+define(`S1', `v3')
+define(`S2', `v4')
+define(`S3', `v5')
+define(`S4', `v26')
+define(`S5', `v27')
+define(`S6', `v28')
+define(`S7', `v29')
+define(`CTR', `v30')
+define(`INC', `v31')
+define(`C0', `v14')
+define(`C1', `v15')
+define(`C2', `v16')
+define(`C3', `v17')
+define(`C4', `v20')
+define(`C5', `v21')
+define(`C6', `v22')
+define(`C7', `v23')
+
+define(`LCNT', `r14')
+define(`ZERO', `v16')
+define(`POLY', `v24')
+C misc: r15,r16,r17
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_aes_encrypt)
+
+ vxor ZERO,ZERO,ZERO
+ subi ROUNDS,ROUNDS,1 C Last AES round uses vcipherlast
+
+ C Store non-volatiles on the 288B stack redzone
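+ C (ELFv2 ABI: the 288 bytes below the stack pointer form a protected
+ C zone that a function making no calls may use without a stack frame)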
+ std r14,-8*1(SP)
+ std r15,-8*2(SP)
+ std r16,-8*3(SP)
+ std r17,-8*4(SP)
+ stxv VSR(v20),-16*3(SP)
+ stxv VSR(v21),-16*4(SP)
+ stxv VSR(v22),-16*5(SP)
+ stxv VSR(v23),-16*6(SP)
+ stxv VSR(v24),-16*7(SP)
+ stxv VSR(v25),-16*8(SP)
+ stxv VSR(v26),-16*9(SP)
+ stxv VSR(v27),-16*10(SP)
+ stxv VSR(v28),-16*11(SP)
+ stxv VSR(v29),-16*12(SP)
+ stxv VSR(v30),-16*13(SP)
+ stxv VSR(v31),-16*14(SP)
+
+ DATA_LOAD_VEC(POLY,.polynomial,r14)
+ DATA_LOAD_VEC(INC,.increment,r14)
+
+ lxvb16x VSR(CTR),0,PCTR C Load the 16B IV||counter block from 'ctr'
+ xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)
+ lxvb16x VSR(D),0,X C Load the current digest from 'X'
+
+L8x:
+ C --- process 8 blocks (128 bits each) per loop iteration ---
+ srdi. LCNT,LENGTH,7 C 8-blocks loop count 'LENGTH / (8 * 16)'
+ beq L4x
+
+ C load table elements
+ li r15,4*16
+ li r16,5*16
+ li r17,6*16
+ lxvd2x VSR(H3M),r15,TABLE
+ lxvd2x VSR(H3L),r16,TABLE
+ lxvd2x VSR(H4M),r17,TABLE
+ li r16,7*16
+ lxvd2x VSR(H4L),r16,TABLE
+ li r15,1*16
+ li r16,2*16
+ li r17,3*16
+ lxvd2x VSR(H1M),0,TABLE
+ lxvd2x VSR(H1L),r15,TABLE
+ lxvd2x VSR(H2M),r16,TABLE
+ lxvd2x VSR(H2L),r17,TABLE
+
+L8x_loop:
+L8x_aes:
+ lxvb16x VSR(K),0,KEYS
+
+ C Increment ctr
+ vmr S0,CTR
+ vadduwm CTR,CTR,INC
+ vxor S0,S0,K
+ vmr S1,CTR
+ vadduwm CTR,CTR,INC
+ vxor S1,S1,K
+ vmr S2,CTR
+ vadduwm CTR,CTR,INC
+ vxor S2,S2,K
+ vmr S3,CTR
+ vadduwm CTR,CTR,INC
+ vxor S3,S3,K
+
+ mtctr ROUNDS
+ li r15,1*16
+
+ vmr S4,CTR
+ vadduwm CTR,CTR,INC
+ vxor S4,S4,K
+ vmr S5,CTR
+ vadduwm CTR,CTR,INC
+ vxor S5,S5,K
+ vmr S6,CTR
+ vadduwm CTR,CTR,INC
+ vxor S6,S6,K
+ vmr S7,CTR
+ vadduwm CTR,CTR,INC
+ vxor S7,S7,K
+
+.align 5
+L8x_aes_rnd_loop:
+ lxvb16x VSR(K),r15,KEYS
+ addi r15,r15,1*16
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ vcipher S2,S2,K
+ vcipher S3,S3,K
+ vcipher S4,S4,K
+ vcipher S5,S5,K
+ vcipher S6,S6,K
+ vcipher S7,S7,K
+ bdnz L8x_aes_rnd_loop
+
+ lxvb16x VSR(K),r15,KEYS
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+ vcipherlast S2,S2,K
+ vcipherlast S3,S3,K
+ vcipherlast S4,S4,K
+ vcipherlast S5,S5,K
+ vcipherlast S6,S6,K
+ vcipherlast S7,S7,K
+
+ C AES(counter) XOR plaintext = ciphertext
+ li r15,1*16
+ li r16,2*16
+ li r17,3*16
+ lxvb16x VSR(C0),0,SRC
+ lxvb16x VSR(C1),r15,SRC
+ lxvb16x VSR(C2),r16,SRC
+ lxvb16x VSR(C3),r17,SRC
+ vxor S0,C0,S0
+ vxor S1,C1,S1
+ vxor S2,C2,S2
+ vxor S3,C3,S3
+
+ addi SRC,SRC,4*16
+ lxvb16x VSR(C4),0,SRC
+ lxvb16x VSR(C5),r15,SRC
+ lxvb16x VSR(C6),r16,SRC
+ lxvb16x VSR(C7),r17,SRC
+ vxor S4,C4,S4
+ vxor S5,C5,S5
+ vxor S6,C6,S6
+ vxor S7,C7,S7
+
+ C Store ciphertext
+ stxvb16x VSR(S0),0,DST
+ stxvb16x VSR(S1),r15,DST
+ stxvb16x VSR(S2),r16,DST
+ stxvb16x VSR(S3),r17,DST
+ addi DST,DST,4*16
+ stxvb16x VSR(S4),0,DST
+ stxvb16x VSR(S5),r15,DST
+ stxvb16x VSR(S6),r16,DST
+ stxvb16x VSR(S7),r17,DST
+
+ addi SRC,SRC,4*16
+ addi DST,DST,4*16
+
+L8x_gcm:
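+ C Aggregated GHASH over four ciphertext blocks:
+ C D = ((D ^ S0)*H^4 ^ S1*H^3 ^ S2*H^2 ^ S3*H) mod P
+ C (the same pattern is then repeated for S4..S7)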
+ C previous digest combining
+ vxor S0,S0,D
+
+ C polynomial multiplication
+ vpmsumd F2,H3L,S1
+ vpmsumd R2,H3M,S1
+ vpmsumd F3,H2L,S2
+ vpmsumd R3,H2M,S2
+ vpmsumd F4,H1L,S3
+ vpmsumd R4,H1M,S3
+ vpmsumd F,H4L,S0
+ vpmsumd R,H4M,S0
+
+ C deferred recombination of partial products
+ vxor F3,F3,F4
+ vxor R3,R3,R4
+ vxor F,F,F2
+ vxor R,R,R2
+ vxor F,F,F3
+ vxor R,R,R3
+
+ C reduction
+ vpmsumd T,F,POLY_L
+ xxswapd VSR(D),VSR(F)
+ vxor R,R,T
+ vxor D,R,D
+
+ C previous digest combining
+ vxor S4,S4,D
+
+ C polynomial multiplication
+ vpmsumd F2,H3L,S5
+ vpmsumd R2,H3M,S5
+ vpmsumd F3,H2L,S6
+ vpmsumd R3,H2M,S6
+ vpmsumd F4,H1L,S7
+ vpmsumd R4,H1M,S7
+ vpmsumd F,H4L,S4
+ vpmsumd R,H4M,S4
+
+ C deferred recombination of partial products
+ vxor F3,F3,F4
+ vxor R3,R3,R4
+ vxor F,F,F2
+ vxor R,R,R2
+ vxor F,F,F3
+ vxor R,R,R3
+
+ C reduction
+ vpmsumd T,F,POLY_L
+ xxswapd VSR(D),VSR(F)
+ vxor R,R,T
+ vxor D,R,D
+
+ C Decrement 8x block count and check if done
+ subi LCNT,LCNT,1
+ cmpldi LCNT,0
+ bne L8x_loop
+ clrldi LENGTH,LENGTH,57 C 'set the high-order 57 bits to zeros'
+
+L4x:
+ C --- process 4 blocks ---
+ srdi. LCNT,LENGTH,6 C 4-blocks loop count 'LENGTH / (4 * 16)'
+ beq L2x
+
+ C load table elements
+ li r15,4*16
+ li r16,5*16
+ li r17,6*16
+ lxvd2x VSR(H3M),r15,TABLE
+ lxvd2x VSR(H3L),r16,TABLE
+ lxvd2x VSR(H4M),r17,TABLE
+ li r16,7*16
+ lxvd2x VSR(H4L),r16,TABLE
+ li r15,1*16
+ li r16,2*16
+ li r17,3*16
+ lxvd2x VSR(H1M),0,TABLE
+ lxvd2x VSR(H1L),r15,TABLE
+ lxvd2x VSR(H2M),r16,TABLE
+ lxvd2x VSR(H2L),r17,TABLE
+
+L4x_aes:
+ lxvb16x VSR(K),0,KEYS
+
+ C Increment ctr
+ vmr S0,CTR
+ vadduwm CTR,CTR,INC
+ vmr S1,CTR
+ vadduwm CTR,CTR,INC
+ vmr S2,CTR
+ vadduwm CTR,CTR,INC
+ vmr S3,CTR
+ vadduwm CTR,CTR,INC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+
+ mtctr ROUNDS
+ li r15,1*16
+
+.align 5
+L4x_aes_rnd_loop:
+ lxvb16x VSR(K),r15,KEYS
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ vcipher S2,S2,K
+ vcipher S3,S3,K
+ addi r15,r15,1*16
+ bdnz L4x_aes_rnd_loop
+
+ lxvb16x VSR(K),r15,KEYS
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+ vcipherlast S2,S2,K
+ vcipherlast S3,S3,K
+
+ C AES(counter) XOR plaintext = ciphertext
+ li r15,1*16
+ li r16,2*16
+ li r17,3*16
+ lxvb16x VSR(C0),0,SRC
+ lxvb16x VSR(C1),r15,SRC
+ lxvb16x VSR(C2),r16,SRC
+ lxvb16x VSR(C3),r17,SRC
+ vxor S0,C0,S0
+ vxor S1,C1,S1
+ vxor S2,C2,S2
+ vxor S3,C3,S3
+
+ C Store ciphertext in DST
+ stxvb16x VSR(S0),0,DST
+ stxvb16x VSR(S1),r15,DST
+ stxvb16x VSR(S2),r16,DST
+ stxvb16x VSR(S3),r17,DST
+
+L4x_gcm:
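+ C Same four-block GHASH aggregation as in L8x_gcm:
+ C D = ((D ^ S0)*H^4 ^ S1*H^3 ^ S2*H^2 ^ S3*H) mod P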
+ C previous digest combining
+ vxor S0,S0,D
+
+ C polynomial multiplication
+ vpmsumd F2,H3L,S1
+ vpmsumd R2,H3M,S1
+ vpmsumd F3,H2L,S2
+ vpmsumd R3,H2M,S2
+ vpmsumd F4,H1L,S3
+ vpmsumd R4,H1M,S3
+ vpmsumd F,H4L,S0
+ vpmsumd R,H4M,S0
+
+ C deferred recombination of partial products
+ vxor F3,F3,F4
+ vxor R3,R3,R4
+ vxor F,F,F2
+ vxor R,R,R2
+ vxor F,F,F3
+ vxor R,R,R3
+
+ C reduction
+ vpmsumd T,F,POLY_L
+ xxswapd VSR(D),VSR(F)
+ vxor R,R,T
+ vxor D,R,D
+
+ addi DST,DST,4*16
+ addi SRC,SRC,4*16
+ clrldi LENGTH,LENGTH,58 C 'set the high-order 58 bits to zeros'
+
+L2x:
+ C --- process 2 blocks ---
+ srdi. r14,LENGTH,5 C 'LENGTH / (2 * 16)'
+ beq L1x
+
+ C load table elements
+ li r15,1*16
+ li r16,2*16
+ li r17,3*16
+ lxvd2x VSR(H1M),0,TABLE
+ lxvd2x VSR(H1L),r15,TABLE
+ lxvd2x VSR(H2M),r16,TABLE
+ lxvd2x VSR(H2L),r17,TABLE
+
+L2x_aes:
+ lxvb16x VSR(K),0,KEYS
+
+ C Increment ctr
+ vmr S0,CTR
+ vadduwm CTR,CTR,INC
+ vmr S1,CTR
+ vadduwm CTR,CTR,INC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+
+ mtctr ROUNDS
+ li r15,1*16
+
+.align 5
+L2x_aes_rnd_loop:
+ lxvb16x VSR(K),r15,KEYS
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ addi r15,r15,1*16
+ bdnz L2x_aes_rnd_loop
+
+ lxvb16x VSR(K),r15,KEYS
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+
+ C AES(counter) XOR plaintext = ciphertext
+ li r15,1*16
+ lxvb16x VSR(C0),0,SRC
+ lxvb16x VSR(C1),r15,SRC
+ vxor S0,C0,S0
+ vxor S1,C1,S1
+
+ C Store ciphertext in DST
+ stxvb16x VSR(S0),0,DST
+ stxvb16x VSR(S1),r15,DST
+
+L2x_gcm:
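+ C Two-block GHASH aggregation:
+ C D = ((D ^ S0)*H^2 ^ S1*H) mod P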
+ C previous digest combining
+ vxor S0,S0,D
+
+ C polynomial multiplication
+ vpmsumd F2,H1L,S1
+ vpmsumd R2,H1M,S1
+ vpmsumd F,H2L,S0
+ vpmsumd R,H2M,S0
+
+ C deferred recombination of partial products
+ vxor F,F,F2
+ vxor R,R,R2
+
+ C reduction
+ vpmsumd T,F,POLY_L
+ xxswapd VSR(D),VSR(F)
+ vxor R,R,T
+ vxor D,R,D
+
+ addi DST,DST,2*16
+ addi SRC,SRC,2*16
+ clrldi LENGTH,LENGTH,59 C 'set the high-order 59 bits to zeros'
+
+L1x:
+ C --- process 1 block ---
+ srdi. r14,LENGTH,4 C 'LENGTH / (1 * 16)'
+ beq Lpartial
+
+ C load table elements
+ li r15,1*16
+ lxvd2x VSR(H1M),0,TABLE
+ lxvd2x VSR(H1L),r15,TABLE
+
+L1x_aes:
+ lxvb16x VSR(K),0,KEYS
+
+ C Increment ctr
+ vmr S0,CTR
+ vadduwm CTR,CTR,INC
+
+ vxor S0,S0,K
+
+ mtctr ROUNDS
+ li r15,1*16
+
+.align 5
+L1x_aes_rnd_loop:
+ lxvb16x VSR(K),r15,KEYS
+ vcipher S0,S0,K
+ addi r15,r15,1*16
+ bdnz L1x_aes_rnd_loop
+
+ lxvb16x VSR(K),r15,KEYS
+ vcipherlast S0,S0,K
+
+ C AES(counter) XOR plaintext = ciphertext
+ lxvb16x VSR(C0),0,SRC
+ vxor S0,C0,S0
+
+ C Store ciphertext in DST
+ stxvb16x VSR(S0),0,DST
+
+L1x_gcm:
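+ C Single-block GHASH update: D = ((D ^ S0)*H) mod P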
+ C previous digest combining
+ vxor S0,S0,D
+
+ C polynomial multiplication
+ vpmsumd F,H1L,S0
+ vpmsumd R,H1M,S0
+
+ C reduction
+ vpmsumd T,F,POLY_L
+ xxswapd VSR(D),VSR(F)
+ vxor R,R,T
+ vxor D,R,D
+
+ addi DST,DST,1*16
+ addi SRC,SRC,1*16
+ clrldi LENGTH,LENGTH,60 C 'set the high-order 60 bits to zeros'
+
+Lpartial:
+ C --- process partial block ---
+ cmpldi LENGTH,0
+ beq Ldone
+
+ C load table elements
+ li r15,1*16
+ lxvd2x VSR(H1M),0,TABLE
+ lxvd2x VSR(H1L),r15,TABLE
+
+Lpartial_aes:
+ lxvb16x VSR(K),0,KEYS
+
+ C Increment ctr
+ vmr S0,CTR
+ vadduwm CTR,CTR,INC
+
+ vxor S0,S0,K
+
+ mtctr ROUNDS
+ li r15,1*16
+
+.align 5
+Lpartial_aes_rnd_loop:
+ lxvb16x VSR(K),r15,KEYS
+ vcipher S0,S0,K
+ addi r15,r15,1*16
+ bdnz Lpartial_aes_rnd_loop
+
+ lxvb16x VSR(K),r15,KEYS
+ vcipherlast S0,S0,K
+
+ C Load the partial block left-aligned and zero-padded
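+ C (lxvll/stxvll take the byte count in the high-order byte of the
+ C length register, hence the shift left by 56)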
+ sldi LENGTH,LENGTH,56
+ lxvll VSR(C0),SRC,LENGTH
+
+ C AES(counter) XOR plaintext = ciphertext
+ vxor S0,C0,S0
+
+ C Store ciphertext in DST
+ stxvll VSR(S0),DST,LENGTH
+
+ C TODO: Lazy approach; reload the just-stored value so the bytes beyond
+ C the partial block are zeroed again before hashing
+ lxvll VSR(S0),DST,LENGTH
+
+Lpartial_gcm:
+ C previous digest combining
+ vxor S0,S0,D
+
+ C polynomial multiplication
+ vpmsumd F,H1L,S0
+ vpmsumd R,H1M,S0
+
+ C reduction
+ vpmsumd T,F,POLY_L
+ xxswapd VSR(D),VSR(F)
+ vxor R,R,T
+ vxor D,R,D
+
+Ldone:
+ stxvb16x VSR(D),0,X C store digest 'D'
+ stxvb16x VSR(CTR),0,PCTR C store updated 'ctr'
+
+ C Restore non-volatiles from the 288B stack redzone
+ ld r14,-8*1(SP)
+ ld r15,-8*2(SP)
+ ld r16,-8*3(SP)
+ ld r17,-8*4(SP)
+ lxv VSR(v20),-16*3(SP)
+ lxv VSR(v21),-16*4(SP)
+ lxv VSR(v22),-16*5(SP)
+ lxv VSR(v23),-16*6(SP)
+ lxv VSR(v24),-16*7(SP)
+ lxv VSR(v25),-16*8(SP)
+ lxv VSR(v26),-16*9(SP)
+ lxv VSR(v27),-16*10(SP)
+ lxv VSR(v28),-16*11(SP)
+ lxv VSR(v29),-16*12(SP)
+ lxv VSR(v30),-16*13(SP)
+ lxv VSR(v31),-16*14(SP)
+
+ li r3,0 C return 0 for success
+ blr
+
+EPILOGUE(_nettle_gcm_aes_encrypt)
+
+.data
+.align 4
+C 0xC2000000000000000000000000000001
+.polynomial:
+IF_BE(`
+ .byte 0xC2
+ .rept 14
+ .byte 0x00
+ .endr
+ .byte 0x01
+',`
+ .byte 0x01
+ .rept 14
+ .byte 0x00
+ .endr
+ .byte 0xC2
+')
+.align 4
+.increment:
+IF_LE(`
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+')
+IF_BE(`
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+')
--
2.26.1