---
Makefile.in | 2 +-
configure.ac | 5 +
gcm.c | 19 +-
powerpc64le/aes-decrypt-internal.asm | 573 +++++++++++++++
powerpc64le/aes-encrypt-internal.asm | 534 ++++++++++++++
powerpc64le/gcm-hash8.asm | 992 ++++++++++++++++++++++++++
powerpc64le/machine.m4 | 0
testsuite/gcm-test.c | 23 +
8 files changed, 2146 insertions(+), 2 deletions(-)
create mode 100644 powerpc64le/aes-decrypt-internal.asm
create mode 100644 powerpc64le/aes-encrypt-internal.asm
create mode 100644 powerpc64le/gcm-hash8.asm
create mode 100644 powerpc64le/machine.m4
diff --git a/Makefile.in b/Makefile.in
index 64ff1001..5bbc0f79 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -603,7 +603,7 @@ distdir: $(DISTFILES)
done
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
- arm arm/neon arm/v6 arm/fat ; do \
+ arm arm/neon arm/v6 arm/fat powerpc64le ; do \
mkdir "$(distdir)/$$d" ; \
find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
-exec cp '{}' "$(distdir)/$$d" ';' ; \
diff --git a/configure.ac b/configure.ac
index 90ea1ea8..1ea54ce8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -435,6 +435,9 @@ if test "x$enable_assembler" = xyes ; then
esac
fi
;;
+ *powerpc64le*)
+ asm_path=powerpc64le
+ ;;
*)
enable_assembler=no
;;
@@ -572,7 +575,9 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_ecc_secp384r1_redc
#undef HAVE_NATIVE_ecc_secp521r1_modp
#undef HAVE_NATIVE_ecc_secp521r1_redc
+#undef HAVE_NATIVE_gcm_init_key8
#undef HAVE_NATIVE_gcm_hash8
+#undef HAVE_NATIVE_gcm_fill
#undef HAVE_NATIVE_salsa20_core
#undef HAVE_NATIVE_sha1_compress
#undef HAVE_NATIVE_sha256_compress
diff --git a/gcm.c b/gcm.c
index cf615daf..809c03bc 100644
--- a/gcm.c
+++ b/gcm.c
@@ -140,6 +140,12 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
memcpy (x->b, Z.b, sizeof(Z));
}
# elif GCM_TABLE_BITS == 8
+# if HAVE_NATIVE_gcm_init_key8
+
+#define gcm_init_key _nettle_gcm_init_key8
+void
+_nettle_gcm_init_key8 (union nettle_block16 *table);
+# endif /* HAVE_NATIVE_gcm_init_key8 */
# if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8
@@ -225,6 +231,13 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
#endif /* GCM_TABLE_BITS */
+#if HAVE_NATIVE_gcm_fill
+
+#define gcm_fill _nettle_gcm_fill
+void
+_nettle_gcm_fill (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer);
+#endif /* HAVE_NATIVE_gcm_fill */
+
/* Increment the rightmost 32 bits. */
#define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
@@ -245,7 +258,9 @@ gcm_set_key(struct gcm_key *key,
memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS
+#ifdef gcm_init_key
+ gcm_init_key(key->h);
+#elif GCM_TABLE_BITS
/* Algorithm 3 from the gcm paper. First do powers of two, then do
the rest by adding. */
while (i /= 2)
@@ -333,6 +348,7 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key,
ctx->auth_size += length;
}
+#ifndef gcm_fill
static nettle_fill16_func gcm_fill;
static void
gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
@@ -349,6 +365,7 @@ gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c);
}
+#endif /* !gcm_fill */
void
gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key,
diff --git a/powerpc64le/aes-decrypt-internal.asm b/powerpc64le/aes-decrypt-internal.asm
new file mode 100644
index 00000000..bde34779
--- /dev/null
+++ b/powerpc64le/aes-decrypt-internal.asm
@@ -0,0 +1,573 @@
+C powerpc64le/aes-decrypt-internal.asm
+
+ifelse(<
+ Copyright (C) 2020 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<ROUNDS>, <3>)
+define(<KEYS>, <4>)
+define(<LENGTH>, <6>)
+define(<DST>, <7>)
+define(<SRC>, <8>)
+
+define(<swap_mask>, <0>)
+
+define(<K>, <1>)
+define(<S0>, <2>)
+define(<S1>, <3>)
+define(<S2>, <4>)
+define(<S3>, <5>)
+define(<S4>, <6>)
+define(<S5>, <7>)
+define(<S6>, <8>)
+define(<S7>, <9>)
+define(<S8>, <10>)
+define(<S9>, <11>)
+define(<S10>, <12>)
+define(<S11>, <13>)
+define(<S12>, <14>)
+define(<S13>, <15>)
+define(<S14>, <16>)
+define(<S15>, <17>)
+
+define(<KX>, <33>)
+define(<S0X>, <34>)
+define(<S1X>, <35>)
+define(<S2X>, <36>)
+define(<S3X>, <37>)
+define(<S4X>, <38>)
+define(<S5X>, <39>)
+define(<S6X>, <40>)
+define(<S7X>, <41>)
+define(<S8X>, <42>)
+define(<S9X>, <43>)
+define(<S10X>, <44>)
+define(<S11X>, <45>)
+define(<S12X>, <46>)
+define(<S13X>, <47>)
+define(<S14X>, <48>)
+define(<S15X>, <49>)
+
+C ZERO vector register is used in place of RoundKey
+C for vncipher instruction because the order of InvMixColumns
+C and Xor processes are flipped in that instruction.
+C The Xor process with RoundKey is executed afterward.
+define(<ZERO>, <18>)
+
+ .file "aes-decrypt-internal.asm"
+
+ C _aes_decrypt(unsigned rounds, const uint32_t *keys,
+ C const struct aes_table *T,
+ C size_t length, uint8_t *dst,
+ C uint8_t *src)
+
+ .text
+.align 5
+PROLOGUE(_nettle_aes_decrypt)
+ vxor ZERO,ZERO,ZERO
+
+ ld 5,.swap_mask@got(TOCP)
+ lvx swap_mask,0,5
+
+ subi ROUNDS,ROUNDS,1
+ srdi LENGTH,LENGTH,4
+
+ srdi 5,LENGTH,4 # 16x loop count
+ cmpldi 5,0
+ beq L8x
+
+ std 17,-120(SP);
+ std 18,-112(SP);
+ std 19,-104(SP);
+ std 20,-96(SP);
+ std 21,-88(SP);
+ std 22,-80(SP);
+ std 23,-72(SP);
+ std 24,-64(SP);
+ std 25,-56(SP);
+ std 26,-48(SP);
+ std 27,-40(SP);
+ std 28,-32(SP);
+ std 29,-24(SP);
+ std 30,-16(SP);
+ std 31,-8(SP);
+
+ li 17,0x10
+ li 18,0x20
+ li 19,0x30
+ li 20,0x40
+ li 21,0x50
+ li 22,0x60
+ li 23,0x70
+ li 24,0x80
+ li 25,0x90
+ li 26,0xA0
+ li 27,0xB0
+ li 28,0xC0
+ li 29,0xD0
+ li 30,0xE0
+ li 31,0xF0
+
+.align 5
+Lx16_loop:
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ lxvd2x S1X,17,SRC
+ lxvd2x S2X,18,SRC
+ lxvd2x S3X,19,SRC
+ lxvd2x S4X,20,SRC
+ lxvd2x S5X,21,SRC
+ lxvd2x S6X,22,SRC
+ lxvd2x S7X,23,SRC
+ lxvd2x S8X,24,SRC
+ lxvd2x S9X,25,SRC
+ lxvd2x S10X,26,SRC
+ lxvd2x S11X,27,SRC
+ lxvd2x S12X,28,SRC
+ lxvd2x S13X,29,SRC
+ lxvd2x S14X,30,SRC
+ lxvd2x S15X,31,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+ vxor S8,S8,K
+ vxor S9,S9,K
+ vxor S10,S10,K
+ vxor S11,S11,K
+ vxor S12,S12,K
+ vxor S13,S13,K
+ vxor S14,S14,K
+ vxor S15,S15,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+ vperm S8,S8,S8,swap_mask
+ vperm S9,S9,S9,swap_mask
+ vperm S10,S10,S10,swap_mask
+ vperm S11,S11,S11,swap_mask
+ vperm S12,S12,S12,swap_mask
+ vperm S13,S13,S13,swap_mask
+ vperm S14,S14,S14,swap_mask
+ vperm S15,S15,S15,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L16x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vncipher S2,S2,ZERO
+ vncipher S3,S3,ZERO
+ vncipher S4,S4,ZERO
+ vncipher S5,S5,ZERO
+ vncipher S6,S6,ZERO
+ vncipher S7,S7,ZERO
+ vncipher S8,S8,ZERO
+ vncipher S9,S9,ZERO
+ vncipher S10,S10,ZERO
+ vncipher S11,S11,ZERO
+ vncipher S12,S12,ZERO
+ vncipher S13,S13,ZERO
+ vncipher S14,S14,ZERO
+ vncipher S15,S15,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+ vxor S8,S8,K
+ vxor S9,S9,K
+ vxor S10,S10,K
+ vxor S11,S11,K
+ vxor S12,S12,K
+ vxor S13,S13,K
+ vxor S14,S14,K
+ vxor S15,S15,K
+ addi 10,10,0x10
+ bdnz L16x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+ vncipherlast S1,S1,K
+ vncipherlast S2,S2,K
+ vncipherlast S3,S3,K
+ vncipherlast S4,S4,K
+ vncipherlast S5,S5,K
+ vncipherlast S6,S6,K
+ vncipherlast S7,S7,K
+ vncipherlast S8,S8,K
+ vncipherlast S9,S9,K
+ vncipherlast S10,S10,K
+ vncipherlast S11,S11,K
+ vncipherlast S12,S12,K
+ vncipherlast S13,S13,K
+ vncipherlast S14,S14,K
+ vncipherlast S15,S15,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+ vperm S8,S8,S8,swap_mask
+ vperm S9,S9,S9,swap_mask
+ vperm S10,S10,S10,swap_mask
+ vperm S11,S11,S11,swap_mask
+ vperm S12,S12,S12,swap_mask
+ vperm S13,S13,S13,swap_mask
+ vperm S14,S14,S14,swap_mask
+ vperm S15,S15,S15,swap_mask
+
+ stxvd2x S0X,0,DST
+ stxvd2x S1X,17,DST
+ stxvd2x S2X,18,DST
+ stxvd2x S3X,19,DST
+ stxvd2x S4X,20,DST
+ stxvd2x S5X,21,DST
+ stxvd2x S6X,22,DST
+ stxvd2x S7X,23,DST
+ stxvd2x S8X,24,DST
+ stxvd2x S9X,25,DST
+ stxvd2x S10X,26,DST
+ stxvd2x S11X,27,DST
+ stxvd2x S12X,28,DST
+ stxvd2x S13X,29,DST
+ stxvd2x S14X,30,DST
+ stxvd2x S15X,31,DST
+
+ addi SRC,SRC,0x100
+ addi DST,DST,0x100
+ subic. 5,5,1
+ bne Lx16_loop
+
+ ld 17,-120(SP);
+ ld 18,-112(SP);
+ ld 19,-104(SP);
+ ld 20,-96(SP);
+ ld 21,-88(SP);
+ ld 22,-80(SP);
+ ld 23,-72(SP);
+ ld 24,-64(SP);
+ ld 25,-56(SP);
+ ld 26,-48(SP);
+ ld 27,-40(SP);
+ ld 28,-32(SP);
+ ld 29,-24(SP);
+ ld 30,-16(SP);
+ ld 31,-8(SP);
+
+ clrldi LENGTH,LENGTH,60
+
+L8x:
+ srdi 5,LENGTH,3
+ cmpldi 5,0
+ beq L4x
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ li 9,0x10
+ lxvd2x S1X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S2X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S3X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S4X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S5X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S6X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S7X,9,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L8x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vncipher S2,S2,ZERO
+ vncipher S3,S3,ZERO
+ vncipher S4,S4,ZERO
+ vncipher S5,S5,ZERO
+ vncipher S6,S6,ZERO
+ vncipher S7,S7,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+ addi 10,10,0x10
+ bdnz L8x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+ vncipherlast S1,S1,K
+ vncipherlast S2,S2,K
+ vncipherlast S3,S3,K
+ vncipherlast S4,S4,K
+ vncipherlast S5,S5,K
+ vncipherlast S6,S6,K
+ vncipherlast S7,S7,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+
+ stxvd2x S0X,0,DST
+ li 9,0x10
+ stxvd2x S1X,9,DST
+ addi 9,9,0x10
+ stxvd2x S2X,9,DST
+ addi 9,9,0x10
+ stxvd2x S3X,9,DST
+ addi 9,9,0x10
+ stxvd2x S4X,9,DST
+ addi 9,9,0x10
+ stxvd2x S5X,9,DST
+ addi 9,9,0x10
+ stxvd2x S6X,9,DST
+ addi 9,9,0x10
+ stxvd2x S7X,9,DST
+
+ addi SRC,SRC,0x80
+ addi DST,DST,0x80
+
+ clrldi LENGTH,LENGTH,61
+
+L4x:
+ srdi 5,LENGTH,2
+ cmpldi 5,0
+ beq L2x
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ li 9,0x10
+ lxvd2x S1X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S2X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S3X,9,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L4x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vncipher S2,S2,ZERO
+ vncipher S3,S3,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ addi 10,10,0x10
+ bdnz L4x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+ vncipherlast S1,S1,K
+ vncipherlast S2,S2,K
+ vncipherlast S3,S3,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+
+ stxvd2x S0X,0,DST
+ li 9,0x10
+ stxvd2x S1X,9,DST
+ addi 9,9,0x10
+ stxvd2x S2X,9,DST
+ addi 9,9,0x10
+ stxvd2x S3X,9,DST
+
+ addi SRC,SRC,0x40
+ addi DST,DST,0x40
+
+ clrldi LENGTH,LENGTH,62
+
+L2x:
+ srdi 5,LENGTH,1
+ cmpldi 5,0
+ beq L1x
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ li 9,0x10
+ lxvd2x S1X,9,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L2x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ addi 10,10,0x10
+ bdnz L2x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+ vncipherlast S1,S1,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+
+ stxvd2x S0X,0,DST
+ li 9,0x10
+ stxvd2x S1X,9,DST
+
+ addi SRC,SRC,0x20
+ addi DST,DST,0x20
+
+ clrldi LENGTH,LENGTH,63
+
+L1x:
+ cmpldi LENGTH,0
+ beq Ldone
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+
+ vxor S0,S0,K
+
+ vperm S0,S0,S0,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L1x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vxor S0,S0,K
+ addi 10,10,0x10
+ bdnz L1x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+
+ vperm S0,S0,S0,swap_mask
+
+ stxvd2x S0X,0,DST
+
+Ldone:
+ blr
+EPILOGUE(_nettle_aes_decrypt)
+
+ .data
+ .align 4
+.swap_mask:
+ .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7
diff --git a/powerpc64le/aes-encrypt-internal.asm b/powerpc64le/aes-encrypt-internal.asm
new file mode 100644
index 00000000..1bbd86a8
--- /dev/null
+++ b/powerpc64le/aes-encrypt-internal.asm
@@ -0,0 +1,534 @@
+C powerpc64le/aes-encrypt-internal.asm
+
+ifelse(<
+ Copyright (C) 2020 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<ROUNDS>, <3>)
+define(<KEYS>, <4>)
+define(<LENGTH>, <6>)
+define(<DST>, <7>)
+define(<SRC>, <8>)
+
+define(<swap_mask>, <0>)
+
+define(<K>, <1>)
+define(<S0>, <2>)
+define(<S1>, <3>)
+define(<S2>, <4>)
+define(<S3>, <5>)
+define(<S4>, <6>)
+define(<S5>, <7>)
+define(<S6>, <8>)
+define(<S7>, <9>)
+define(<S8>, <10>)
+define(<S9>, <11>)
+define(<S10>, <12>)
+define(<S11>, <13>)
+define(<S12>, <14>)
+define(<S13>, <15>)
+define(<S14>, <16>)
+define(<S15>, <17>)
+
+define(<KX>, <33>)
+define(<S0X>, <34>)
+define(<S1X>, <35>)
+define(<S2X>, <36>)
+define(<S3X>, <37>)
+define(<S4X>, <38>)
+define(<S5X>, <39>)
+define(<S6X>, <40>)
+define(<S7X>, <41>)
+define(<S8X>, <42>)
+define(<S9X>, <43>)
+define(<S10X>, <44>)
+define(<S11X>, <45>)
+define(<S12X>, <46>)
+define(<S13X>, <47>)
+define(<S14X>, <48>)
+define(<S15X>, <49>)
+
+ .file "aes-encrypt-internal.asm"
+
+ C _aes_encrypt(unsigned rounds, const uint32_t *keys,
+ C const struct aes_table *T,
+ C size_t length, uint8_t *dst,
+ C uint8_t *src)
+
+ .text
+.align 5
+PROLOGUE(_nettle_aes_encrypt)
+ ld 5,.swap_mask@got(TOCP)
+ lvx swap_mask,0,5
+
+ subi ROUNDS,ROUNDS,1
+ srdi LENGTH,LENGTH,4
+
+ srdi 5,LENGTH,4 # 16x loop count
+ cmpldi 5,0
+ beq L8x
+
+ std 17,-120(SP);
+ std 18,-112(SP);
+ std 19,-104(SP);
+ std 20,-96(SP);
+ std 21,-88(SP);
+ std 22,-80(SP);
+ std 23,-72(SP);
+ std 24,-64(SP);
+ std 25,-56(SP);
+ std 26,-48(SP);
+ std 27,-40(SP);
+ std 28,-32(SP);
+ std 29,-24(SP);
+ std 30,-16(SP);
+ std 31,-8(SP);
+
+ li 17,0x10
+ li 18,0x20
+ li 19,0x30
+ li 20,0x40
+ li 21,0x50
+ li 22,0x60
+ li 23,0x70
+ li 24,0x80
+ li 25,0x90
+ li 26,0xA0
+ li 27,0xB0
+ li 28,0xC0
+ li 29,0xD0
+ li 30,0xE0
+ li 31,0xF0
+
+.align 5
+Lx16_loop:
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ lxvd2x S1X,17,SRC
+ lxvd2x S2X,18,SRC
+ lxvd2x S3X,19,SRC
+ lxvd2x S4X,20,SRC
+ lxvd2x S5X,21,SRC
+ lxvd2x S6X,22,SRC
+ lxvd2x S7X,23,SRC
+ lxvd2x S8X,24,SRC
+ lxvd2x S9X,25,SRC
+ lxvd2x S10X,26,SRC
+ lxvd2x S11X,27,SRC
+ lxvd2x S12X,28,SRC
+ lxvd2x S13X,29,SRC
+ lxvd2x S14X,30,SRC
+ lxvd2x S15X,31,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+ vxor S8,S8,K
+ vxor S9,S9,K
+ vxor S10,S10,K
+ vxor S11,S11,K
+ vxor S12,S12,K
+ vxor S13,S13,K
+ vxor S14,S14,K
+ vxor S15,S15,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+ vperm S8,S8,S8,swap_mask
+ vperm S9,S9,S9,swap_mask
+ vperm S10,S10,S10,swap_mask
+ vperm S11,S11,S11,swap_mask
+ vperm S12,S12,S12,swap_mask
+ vperm S13,S13,S13,swap_mask
+ vperm S14,S14,S14,swap_mask
+ vperm S15,S15,S15,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L16x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ vcipher S2,S2,K
+ vcipher S3,S3,K
+ vcipher S4,S4,K
+ vcipher S5,S5,K
+ vcipher S6,S6,K
+ vcipher S7,S7,K
+ vcipher S8,S8,K
+ vcipher S9,S9,K
+ vcipher S10,S10,K
+ vcipher S11,S11,K
+ vcipher S12,S12,K
+ vcipher S13,S13,K
+ vcipher S14,S14,K
+ vcipher S15,S15,K
+ addi 10,10,0x10
+ bdnz L16x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+ vcipherlast S2,S2,K
+ vcipherlast S3,S3,K
+ vcipherlast S4,S4,K
+ vcipherlast S5,S5,K
+ vcipherlast S6,S6,K
+ vcipherlast S7,S7,K
+ vcipherlast S8,S8,K
+ vcipherlast S9,S9,K
+ vcipherlast S10,S10,K
+ vcipherlast S11,S11,K
+ vcipherlast S12,S12,K
+ vcipherlast S13,S13,K
+ vcipherlast S14,S14,K
+ vcipherlast S15,S15,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+ vperm S8,S8,S8,swap_mask
+ vperm S9,S9,S9,swap_mask
+ vperm S10,S10,S10,swap_mask
+ vperm S11,S11,S11,swap_mask
+ vperm S12,S12,S12,swap_mask
+ vperm S13,S13,S13,swap_mask
+ vperm S14,S14,S14,swap_mask
+ vperm S15,S15,S15,swap_mask
+
+ stxvd2x S0X,0,DST
+ stxvd2x S1X,17,DST
+ stxvd2x S2X,18,DST
+ stxvd2x S3X,19,DST
+ stxvd2x S4X,20,DST
+ stxvd2x S5X,21,DST
+ stxvd2x S6X,22,DST
+ stxvd2x S7X,23,DST
+ stxvd2x S8X,24,DST
+ stxvd2x S9X,25,DST
+ stxvd2x S10X,26,DST
+ stxvd2x S11X,27,DST
+ stxvd2x S12X,28,DST
+ stxvd2x S13X,29,DST
+ stxvd2x S14X,30,DST
+ stxvd2x S15X,31,DST
+
+ addi SRC,SRC,0x100
+ addi DST,DST,0x100
+ subic. 5,5,1
+ bne Lx16_loop
+
+ ld 17,-120(SP);
+ ld 18,-112(SP);
+ ld 19,-104(SP);
+ ld 20,-96(SP);
+ ld 21,-88(SP);
+ ld 22,-80(SP);
+ ld 23,-72(SP);
+ ld 24,-64(SP);
+ ld 25,-56(SP);
+ ld 26,-48(SP);
+ ld 27,-40(SP);
+ ld 28,-32(SP);
+ ld 29,-24(SP);
+ ld 30,-16(SP);
+ ld 31,-8(SP);
+
+ clrldi LENGTH,LENGTH,60
+
+L8x:
+ srdi 5,LENGTH,3
+ cmpldi 5,0
+ beq L4x
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ li 9,0x10
+ lxvd2x S1X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S2X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S3X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S4X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S5X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S6X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S7X,9,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L8x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ vcipher S2,S2,K
+ vcipher S3,S3,K
+ vcipher S4,S4,K
+ vcipher S5,S5,K
+ vcipher S6,S6,K
+ vcipher S7,S7,K
+ addi 10,10,0x10
+ bdnz L8x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+ vcipherlast S2,S2,K
+ vcipherlast S3,S3,K
+ vcipherlast S4,S4,K
+ vcipherlast S5,S5,K
+ vcipherlast S6,S6,K
+ vcipherlast S7,S7,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask
+
+ stxvd2x S0X,0,DST
+ li 9,0x10
+ stxvd2x S1X,9,DST
+ addi 9,9,0x10
+ stxvd2x S2X,9,DST
+ addi 9,9,0x10
+ stxvd2x S3X,9,DST
+ addi 9,9,0x10
+ stxvd2x S4X,9,DST
+ addi 9,9,0x10
+ stxvd2x S5X,9,DST
+ addi 9,9,0x10
+ stxvd2x S6X,9,DST
+ addi 9,9,0x10
+ stxvd2x S7X,9,DST
+
+ addi SRC,SRC,0x80
+ addi DST,DST,0x80
+
+ clrldi LENGTH,LENGTH,61
+
+L4x:
+ srdi 5,LENGTH,2
+ cmpldi 5,0
+ beq L2x
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ li 9,0x10
+ lxvd2x S1X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S2X,9,SRC
+ addi 9,9,0x10
+ lxvd2x S3X,9,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L4x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ vcipher S2,S2,K
+ vcipher S3,S3,K
+ addi 10,10,0x10
+ bdnz L4x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+ vcipherlast S2,S2,K
+ vcipherlast S3,S3,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+
+ stxvd2x S0X,0,DST
+ li 9,0x10
+ stxvd2x S1X,9,DST
+ addi 9,9,0x10
+ stxvd2x S2X,9,DST
+ addi 9,9,0x10
+ stxvd2x S3X,9,DST
+
+ addi SRC,SRC,0x40
+ addi DST,DST,0x40
+
+ clrldi LENGTH,LENGTH,62
+
+L2x:
+ srdi 5,LENGTH,1
+ cmpldi 5,0
+ beq L1x
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+ li 9,0x10
+ lxvd2x S1X,9,SRC
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L2x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ addi 10,10,0x10
+ bdnz L2x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+
+ vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+
+ stxvd2x S0X,0,DST
+ li 9,0x10
+ stxvd2x S1X,9,DST
+
+ addi SRC,SRC,0x20
+ addi DST,DST,0x20
+
+ clrldi LENGTH,LENGTH,63
+
+L1x:
+ cmpldi LENGTH,0
+ beq Ldone
+
+ lxvd2x KX,0,KEYS
+
+ lxvd2x S0X,0,SRC
+
+ vxor S0,S0,K
+
+ vperm S0,S0,S0,swap_mask
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L1x_round_loop:
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ addi 10,10,0x10
+ bdnz L1x_round_loop
+
+ lxvd2x KX,10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+
+ vperm S0,S0,S0,swap_mask
+
+ stxvd2x S0X,0,DST
+
+Ldone:
+ blr
+EPILOGUE(_nettle_aes_encrypt)
+
+ .data
+ .align 4
+.swap_mask:
+ .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7
diff --git a/powerpc64le/gcm-hash8.asm b/powerpc64le/gcm-hash8.asm
new file mode 100644
index 00000000..a809f6ef
--- /dev/null
+++ b/powerpc64le/gcm-hash8.asm
@@ -0,0 +1,992 @@
+C powerpc64le/gcm-hash8.asm
+
+ifelse(<
+ Copyright (C) 2020 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+C VSX instructions are used to load and store data to memory "lxvd2x, stxvd2x"
+C instead of VR instructions "lvx, stvx" as a workaround to access unaligned data
+C VSX registers are defined with "X" suffix
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<TABLE>, <3>)
+define(<X>, <4>)
+define(<LENGTH>, <5>)
+define(<DATA>, <6>)
+
+define(<zero>, <0>)
+define(<swap_mask>, <1>)
+define(<hidw_mask>, <2>)
+define(<lodw_mask>, <3>)
+define(<poly>, <4>)
+define(<poly_h>, <4>)
+define(<poly_l>, <5>)
+define(<RP>, <6>)
+define(<Mh>, <7>)
+define(<Ml>, <8>)
+define(<H>, <9>)
+define(<Hh>, <10>)
+define(<Hl>, <11>)
+define(<RP2>, <9>)
+define(<M2h>, <10>)
+define(<M2l>, <11>)
+
+define(<HX>, <41>)
+define(<HhX>, <42>)
+define(<HlX>, <43>)
+define(<H_HhX>, <44>)
+define(<H_HX>, <45>)
+define(<H_HlX>, <46>)
+
+define(<sl1>, <1>)
+define(<msb>, <5>)
+define(<H2>, <6>)
+define(<H2h>, <7>)
+define(<H2l>, <8>)
+define(<H_h>, <12>)
+define(<H_m>, <13>)
+define(<H_l>, <14>)
+define(<H_Hh>, <12>)
+define(<H_H>, <13>)
+define(<H_Hl>, <14>)
+define(<H_t>, <15>)
+define(<H2_h>, <16>)
+define(<H2_m>, <17>)
+define(<H2_l>, <18>)
+define(<H2_t>, <19>)
+
+define(<C0X>, <38>)
+define(<C1X>, <39>)
+define(<C2X>, <40>)
+define(<C3X>, <44>)
+define(<C4X>, <38>)
+define(<C5X>, <39>)
+define(<C6X>, <40>)
+define(<C7X>, <44>)
+
+define(<CX>, <45>)
+
+define(<C0>, <6>)
+define(<C1>, <7>)
+define(<C2>, <8>)
+define(<C3>, <12>)
+define(<C4>, <6>)
+define(<C5>, <7>)
+define(<C6>, <8>)
+define(<C7>, <12>)
+
+define(<C>, <13>)
+
+define(<Ch>, <14>)
+define(<Cl>, <15>)
+define(<Cm>, <16>)
+
+define(<C01h>, <14>)
+define(<C01l>, <15>)
+define(<C01>, <16>)
+define(<C23h>, <17>)
+define(<C23l>, <18>)
+define(<C23>, <19>)
+define(<C45h>, <20>)
+define(<C45l>, <21>)
+define(<C45>, <22>)
+define(<C67h>, <6>)
+define(<C67l>, <7>)
+define(<C67>, <8>)
+
+define(<H21>, <9>)
+define(<H21h>, <10>)
+define(<H21l>, <11>)
+define(<H43>, <23>)
+define(<H43h>, <24>)
+define(<H43l>, <25>)
+define(<H65>, <26>)
+define(<H65h>, <27>)
+define(<H65l>, <28>)
+define(<H87>, <29>)
+define(<H87h>, <30>)
+define(<H87l>, <31>)
+
+define(<H21X>, <41>)
+define(<H21hX>, <42>)
+define(<H21lX>, <43>)
+define(<H43X>, <55>)
+define(<H43hX>, <56>)
+define(<H43lX>, <57>)
+define(<H65X>, <58>)
+define(<H65hX>, <59>)
+define(<H65lX>, <60>)
+define(<H87X>, <61>)
+define(<H87hX>, <62>)
+define(<H87lX>, <63>)
+
+# gcm_fill registers:
+
+define(<CTR>, <3>)
+define(<BLOCKS>, <4>)
+define(<BUFFER>, <5>)
+
+define(<CTR0>, <2>)
+define(<CTR0S>, <3>)
+define(<CTR1>, <4>)
+define(<CTR2>, <5>)
+define(<CTR3>, <6>)
+define(<CTR4>, <7>)
+define(<CTR5>, <8>)
+define(<CTR6>, <9>)
+define(<CTR7>, <10>)
+
+define(<CTR0X>, <34>)
+define(<CTR0SX>, <35>)
+define(<CTR1X>, <36>)
+define(<CTR2X>, <37>)
+define(<CTR3X>, <38>)
+define(<CTR4X>, <39>)
+define(<CTR5X>, <40>)
+define(<CTR6X>, <41>)
+define(<CTR7X>, <42>)
+
+define(<I1>, <11>)
+define(<I2>, <12>)
+define(<I3>, <13>)
+define(<I4>, <14>)
+define(<I5>, <15>)
+define(<I6>, <16>)
+define(<I7>, <17>)
+define(<I8>, <18>)
+
+ .file "gcm-hash8.asm"
+
+ # void gcm_init_key (union gcm_block *table)
+
+ .text
+.align 5
+PROLOGUE(_nettle_gcm_init_key8)
+ ld 7,.polynomial@got(TOCP)
+ lvx poly,0,7
+ ld 7,.swap_mask@got(TOCP)
+ lvx swap_mask,0,7
+ ld 7,.hidw_mask@got(TOCP)
+ lvx hidw_mask,0,7
+ ld 7,.lodw_mask@got(TOCP)
+ lvx lodw_mask,0,7
+
+ li 10,0x800
+ lxvd2x HX,10,TABLE # load H
+ vperm H,H,H,swap_mask
+
+ # --- calculate H = H shift left 1 modulo polynomial ---
+
+ vupkhsw msb,H # most significant bit word-extend
+ vspltisb sl1,1 # splat 1 for shift left
+ vspltw msb,msb,0 # most significant bit extend
+ vsl H,H,sl1 # H shift left 1
+ vand msb,msb,poly
+ vxor zero,zero,zero
+ vxor H_t,H,msb
+
+ vsldoi H,H_t,H_t,8 # doubleword swap
+ vsldoi Hh,H,zero,8
+ vsldoi Hl,zero,H,8
+
+ # --- calculate H^2 = H*H ---
+
+ # reduction pre-processing
+ vsldoi poly_h,zero,poly,8
+ vsldoi poly_l,poly_h,poly_h,8
+
+ # polynomial multiplication "classical"
+ vpmsumd H_h,H_t,Hh # H^1h*H^1h
+ vpmsumd H_l,H_t,Hl # H^1l*H^1l
+ vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h
+
+ # reduction first phase # [1]
+ vpmsumd RP,H_l,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8 # [2]
+ vsldoi Ml,H_m,zero,8 # [2]
+ vsldoi RP,RP,RP,8 # [1]
+ vxor H_h,H_h,Mh # [2]
+ vxor H_l,H_l,Ml # [2]
+ vxor H_l,H_l,RP # [1]
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l
+ vxor H_h,H_l,H_h
+ vxor H2_t,H_h,RP
+
+ vsldoi H2,H2_t,H2_t,8
+ vsldoi H2h,H2,zero,8
+ vsldoi H2l,zero,H2,8
+
+ # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
+
+ # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] ---
+
+ li 8,0x00
+ li 9,0x100
+ li 10,0x200
+ stxvd2x HlX,8,TABLE
+ stxvd2x HX,9,TABLE
+ stxvd2x HhX,10,TABLE
+
+ li 8,0x300
+ li 9,0x400
+ li 10,0x500
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+ # --- calculate H^3,H^4 ---
+
+ # polynomial multiplication "classical"
+ vpmsumd H_l,H_t,H2l # H^1l*H^2l
+ vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h
+ vpmsumd H_h,H_t,H2h # H^1h*H^2h
+ vpmsumd H2_l,H2_t,H2l # H^2l*H^2l
+ vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h
+ vpmsumd H2_h,H2_t,H2h # H^2h*H^2h
+
+ # reduction first phase # [1]
+ vpmsumd RP,H_l,poly_h # [1] H^3
+ vpmsumd RP2,H2_l,poly_h # [1] H^4
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8 # [2] H^3
+ vsldoi M2h,zero,H2_m,8 # [2] H^4
+ vsldoi Ml,H_m,zero,8 # [2] H^3
+ vsldoi M2l,H2_m,zero,8 # [2] H^4
+ vsldoi RP,RP,RP,8 # [1] H^3
+ vsldoi RP2,RP2,RP2,8 # [1] H^4
+ vxor H_h,H_h,Mh # [2] H^3
+ vxor H2_h,H2_h,M2h # [2] H^4
+ vxor H_l,H_l,Ml # [2] H^3
+ vxor H2_l,H2_l,M2l # [2] H^4
+ vxor H_l,H_l,RP # [1] H^3
+ vxor H2_l,H2_l,RP2 # [1] H^4
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l # H^3
+ vpmsumd RP2,H2_l,poly_l # H^4
+ vxor H_h,H_l,H_h # H^3
+ vxor H2_h,H2_l,H2_h # H^4
+ vxor H_h,H_h,RP # H^3
+ vxor H2_h,H2_h,RP2 # H^4
+
+ vsldoi H2,H2_h,H2_h,8 # H^4
+ vsldoi H,H_h,H_h,8 # H^3
+ vsldoi H2l,zero,H2,8 # H^4
+ vsldoi H2h,H2,zero,8 # H^4
+
+ # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
+
+ # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] ---
+
+ li 8,0x600
+ li 9,0x700
+ li 10,0x800
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+ # --- calculate H^5,H^6 ---
+
+ # polynomial multiplication "classical"
+ vpmsumd H_l,H_t,H2l # H^1l*H^4l
+ vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h
+ vpmsumd H_h,H_t,H2h # H^1h*H^4h
+ vpmsumd H2_l,H2_t,H2l # H^2l*H^4l
+ vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h
+ vpmsumd H2_h,H2_t,H2h # H^2h*H^4h
+
+ # reduction first phase # [1]
+ vpmsumd RP,H_l,poly_h # [1] H^5
+ vpmsumd RP2,H2_l,poly_h # [1] H^6
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8 # [2] H^5
+ vsldoi M2h,zero,H2_m,8 # [2] H^6
+ vsldoi Ml,H_m,zero,8 # [2] H^5
+ vsldoi M2l,H2_m,zero,8 # [2] H^6
+ vsldoi RP,RP,RP,8 # [1] H^5
+ vsldoi RP2,RP2,RP2,8 # [1] H^6
+ vxor H_h,H_h,Mh # [2] H^5
+ vxor H2_h,H2_h,M2h # [2] H^6
+ vxor H_l,H_l,Ml # [2] H^5
+ vxor H2_l,H2_l,M2l # [2] H^6
+ vxor H_l,H_l,RP # [1] H^5
+ vxor H2_l,H2_l,RP2 # [1] H^6
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l # H^5
+ vpmsumd RP2,H2_l,poly_l # H^6
+ vxor H_h,H_l,H_h # H^5
+ vxor H2_h,H2_l,H2_h # H^6
+ vxor H_h,H_h,RP # H^5
+ vxor H2_h,H2_h,RP2 # H^6
+
+ vsldoi H2,H2_h,H2_h,8 # H^6
+ vsldoi H,H_h,H_h,8 # H^5
+ vsldoi H2l,zero,H2,8 # H^6
+ vsldoi H2h,H2,zero,8 # H^6
+
+ # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
+
+ # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] ---
+
+ li 8,0x900
+ li 9,0xA00
+ li 10,0xB00
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+ # --- calculate H^7,H^8 ---
+
+ # polynomial multiplication "classical"
+ vpmsumd H_l,H_t,H2l # H^1l*H^6l
+ vpmsumd H_m,H_t,H2 # H^1h*H^6l⊕H^1l*H^6h
+ vpmsumd H_h,H_t,H2h # H^1h*H^6h
+ vpmsumd H2_l,H2_t,H2l # H^2l*H^6l
+ vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h
+ vpmsumd H2_h,H2_t,H2h # H^2h*H^6h
+
+ # reduction first phase # [1]
+ vpmsumd RP,H_l,poly_h # [1] H^7
+ vpmsumd RP2,H2_l,poly_h # [1] H^8
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8 # [2] H^7
+ vsldoi M2h,zero,H2_m,8 # [2] H^8
+ vsldoi Ml,H_m,zero,8 # [2] H^7
+ vsldoi M2l,H2_m,zero,8 # [2] H^8
+ vsldoi RP,RP,RP,8 # [1] H^7
+ vsldoi RP2,RP2,RP2,8 # [1] H^8
+ vxor H_h,H_h,Mh # [2] H^7
+ vxor H2_h,H2_h,M2h # [2] H^8
+ vxor H_l,H_l,Ml # [2] H^7
+ vxor H2_l,H2_l,M2l # [2] H^8
+ vxor H_l,H_l,RP # [1] H^7
+ vxor H2_l,H2_l,RP2 # [1] H^8
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l # H^7
+ vpmsumd RP2,H2_l,poly_l # H^8
+ vxor H_h,H_l,H_h # H^7
+ vxor H2_h,H2_l,H2_h # H^8
+ vxor H_h,H_h,RP # H^7
+ vxor H2_h,H2_h,RP2 # H^8
+
+ vsldoi H,H_h,H_h,8 # H^7
+ vsldoi H2,H2_h,H2_h,8 # H^8
+
+ # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
+
+ # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] ---
+
+ li 8,0xC00
+ li 9,0xD00
+ li 10,0xE00
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+ blr
+EPILOGUE(_nettle_gcm_init_key8)
+
+ # void gcm_hash (const struct gcm_key *key, union gcm_block *x,
+ # size_t length, const uint8_t *data)
+ #
+ # GHASH over `length` bytes of `data`, folded into the digest at `x`.
+ # The bulk is processed 8 blocks (128 bytes) per iteration using the
+ # precomputed powers H^1..H^8 stored in `key` by _nettle_gcm_init_key8,
+ # then 2 blocks, then 1 block, then a zero-padded byte remainder.
+
+.align 5
+PROLOGUE(_nettle_gcm_hash8)
+ vxor zero,zero,zero
+
+ # load the reduction polynomial and permute masks from the TOC
+ ld 7,.polynomial@got(TOCP)
+ lvx poly,0,7
+ ld 7,.swap_mask@got(TOCP)
+ lvx swap_mask,0,7
+ ld 7,.hidw_mask@got(TOCP)
+ lvx hidw_mask,0,7
+ ld 7,.lodw_mask@got(TOCP)
+ lvx lodw_mask,0,7
+
+ vsldoi poly_h,zero,poly,8
+ vsldoi poly_l,poly_h,poly_h,8
+
+ lxvd2x CX,0,X # load X
+ vperm C,C,C,swap_mask # byte-swap digest into internal order
+
+ srdi 7,LENGTH,7 # 8x loop count (length / 128)
+ cmpldi 7,0
+ beq L2x
+
+ # backup registers: non-volatile GPRs r28-r31 and VRs v20-v31
+ # in a 224-byte frame (per the ELFv2 ABI these must be preserved)
+ stdu SP,-224(SP)
+ std 28,216(SP)
+ std 29,208(SP)
+ std 30,200(SP)
+ std 31,192(SP)
+ li 8,176
+ stvx 20,8,SP
+ subi 8,8,16
+ stvx 21,8,SP
+ subi 8,8,16
+ stvx 22,8,SP
+ subi 8,8,16
+ stvx 23,8,SP
+ subi 8,8,16
+ stvx 24,8,SP
+ subi 8,8,16
+ stvx 25,8,SP
+ subi 8,8,16
+ stvx 26,8,SP
+ subi 8,8,16
+ stvx 27,8,SP
+ subi 8,8,16
+ stvx 28,8,SP
+ subi 8,8,16
+ stvx 29,8,SP
+ subi 8,8,16
+ stvx 30,8,SP
+ subi 8,8,16
+ stvx 31,8,SP
+
+ # table loading: key powers H^1..H^8 in the layout written by
+ # _nettle_gcm_init_key8 (H21 at 0x300.., H43 at 0x600.., etc.)
+ li 8,0x300
+ li 9,0x400
+ li 10,0x500
+ lxvd2x H21hX,8,TABLE
+ lxvd2x H21X,9,TABLE
+ lxvd2x H21lX,10,TABLE
+ li 8,0x600
+ li 9,0x700
+ li 10,0x800
+ lxvd2x H43hX,8,TABLE
+ lxvd2x H43X,9,TABLE
+ lxvd2x H43lX,10,TABLE
+ li 8,0x900
+ li 9,0xA00
+ li 10,0xB00
+ lxvd2x H65hX,8,TABLE
+ lxvd2x H65X,9,TABLE
+ lxvd2x H65lX,10,TABLE
+ li 8,0xC00
+ li 9,0xD00
+ li 10,0xE00
+ lxvd2x H87hX,8,TABLE
+ lxvd2x H87X,9,TABLE
+ lxvd2x H87lX,10,TABLE
+
+ # block offsets 0x10..0x70 for the 8-block loads below
+ li 8,0x10
+ li 9,0x20
+ li 10,0x30
+ li 28,0x40
+ li 29,0x50
+ li 30,0x60
+ li 31,0x70
+
+ mtctr 7
+.align 5
+L8x_loop:
+ # input loading
+ lxvd2x C0X,0,DATA # load C0
+ lxvd2x C1X,8,DATA # load C1
+ lxvd2x C2X,9,DATA # load C2
+ lxvd2x C3X,10,DATA # load C3
+
+ # swap permuting
+ vperm C0,C0,C0,swap_mask
+ vperm C1,C1,C1,swap_mask
+ vperm C2,C2,C2,swap_mask
+ vperm C3,C3,C3,swap_mask
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ # polynomial multiplication "karatsuba" pre-processing
+ vperm C23h,C2,C3,hidw_mask
+ vperm C23l,C2,C3,lodw_mask
+ vperm C01h,C0,C1,hidw_mask
+ vperm C01l,C0,C1,lodw_mask
+
+ # input loading
+ lxvd2x C4X,28,DATA # load C4
+ lxvd2x C5X,29,DATA # load C5
+ lxvd2x C6X,30,DATA # load C6
+ lxvd2x C7X,31,DATA # load C7
+
+ # swap permuting
+ vperm C4,C4,C4,swap_mask
+ vperm C5,C5,C5,swap_mask
+ vperm C6,C6,C6,swap_mask
+ vperm C7,C7,C7,swap_mask
+
+ # polynomial multiplication "karatsuba" pre-processing
+ vperm C45h,C4,C5,hidw_mask
+ vperm C45l,C4,C5,lodw_mask
+ vperm C67h,C6,C7,hidw_mask
+ vperm C67l,C6,C7,lodw_mask
+ vxor C23,C23h,C23l
+ vxor C01,C01h,C01l
+ vxor C45,C45h,C45l
+ vxor C67,C67h,C67l
+
+ # polynomial multiplication "karatsuba"
+ vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h
+ vpmsumd C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l
+ vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h
+ vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l
+ vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h
+ vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l
+ vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h
+ vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l
+ vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l)
+ vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l)
+ vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l)
+ vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l)
+
+ # polynomial multiplication "karatsuba" post-processing
+ vxor C23,C23,C23h
+ vxor C01,C01,C01h
+ vxor C45,C45,C45h
+ vxor C67,C67,C67h
+ vxor C23,C23,C23l
+ vxor C01,C01,C01l
+ vxor C45,C45,C45l
+ vxor C67,C67,C67l
+
+ # deferred recombination of partial products
+ vxor C01h,C01h,C23h # H0 = H01⊕H23
+ vxor C45h,C45h,C67h # H1 = H45⊕H67
+ vxor C01l,C01l,C23l # L0 = L01⊕L23
+ vxor C45l,C45l,C67l # L1 = L45⊕L67
+ vxor C01,C01,C23 # M0 = M01⊕M23
+ vxor C45,C45,C67 # M1 = M45⊕M67
+ vxor C01h,C01h,C45h # H = H0⊕H1
+ vxor C01l,C01l,C45l # L = L0⊕L1
+ vxor C01,C01,C45 # M = M0⊕M1
+
+ # reduction first phase # [1]
+ vpmsumd RP,C01l,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,C01,8 # [2]
+ vsldoi Ml,C01,zero,8 # [2]
+ vsldoi RP,RP,RP,8 # [1]
+ vxor C01h,C01h,Mh # [2]
+ vxor C01l,C01l,Ml # [2]
+ vxor C01l,C01l,RP # [1]
+
+ # reduction second phase
+ vpmsumd RP,C01l,poly_l
+ vxor C01h,C01l,C01h
+ vxor C,C01h,RP
+
+ addi DATA,DATA,0x80
+ bdnz L8x_loop
+
+ # restore registers
+ li 8,0
+ lvx 31,8,SP
+ addi 8,8,16
+ lvx 30,8,SP
+ addi 8,8,16
+ lvx 29,8,SP
+ addi 8,8,16
+ lvx 28,8,SP
+ addi 8,8,16
+ lvx 27,8,SP
+ addi 8,8,16
+ lvx 26,8,SP
+ addi 8,8,16
+ lvx 25,8,SP
+ addi 8,8,16
+ lvx 24,8,SP
+ addi 8,8,16
+ lvx 23,8,SP
+ addi 8,8,16
+ lvx 22,8,SP
+ addi 8,8,16
+ lvx 21,8,SP
+ addi 8,8,16
+ lvx 20,8,SP
+ ld 31,192(SP)
+ ld 30,200(SP)
+ ld 29,208(SP)
+ ld 28,216(SP)
+ addi SP,SP,224
+
+ clrldi LENGTH,LENGTH,57 # keep remaining length mod 128
+L2x:
+ srdi 7,LENGTH,5 # 2x loop count (length / 32)
+ cmpldi 7,0
+ beq L1x
+
+ # table loading (H^1,H^2 triple only)
+ li 8,0x300
+ li 9,0x400
+ li 10,0x500
+ lxvd2x H21hX,8,TABLE
+ lxvd2x H21X,9,TABLE
+ lxvd2x H21lX,10,TABLE
+
+ li 10,0x10
+
+ mtctr 7
+.align 5
+L2x_loop:
+ # input loading
+ lxvd2x C0X,0,DATA # load C0
+ lxvd2x C1X,10,DATA # load C1
+
+ # swap permuting
+ vperm C0,C0,C0,swap_mask
+ vperm C1,C1,C1,swap_mask
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ # polynomial multiplication "karatsuba" pre-processing
+ vperm C01h,C0,C1,hidw_mask
+ vperm C01l,C0,C1,lodw_mask
+ vxor C01,C01h,C01l
+
+ # polynomial multiplication "karatsuba"
+ vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h
+ vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l
+ vpmsumd C01,C01,H21 # M01 = (H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l)
+
+ # polynomial multiplication "karatsuba" post-processing
+ vxor C01,C01,C01h
+ vxor C01,C01,C01l
+
+ # reduction first phase # [1]
+ vpmsumd RP,C01l,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,C01,8 # [2]
+ vsldoi Ml,C01,zero,8 # [2]
+ vsldoi RP,RP,RP,8 # [1]
+ vxor C01h,C01h,Mh # [2]
+ vxor C01l,C01l,Ml # [2]
+ vxor C01l,C01l,RP # [1]
+
+ # reduction second phase
+ vpmsumd RP,C01l,poly_l
+ vxor C01h,C01l,C01h
+ vxor C,C01h,RP
+
+ addi DATA,DATA,0x20
+ bdnz L2x_loop
+
+ clrldi LENGTH,LENGTH,59 # keep remaining length mod 32
+L1x:
+ srdi 7,LENGTH,4 # single full block left?
+ cmpldi 7,0
+ beq Lrem
+
+ # table loading (H^1 triple)
+ li 9,0x100
+ li 10,0x200
+ lxvd2x HlX,0,TABLE
+ lxvd2x HX, 9,TABLE
+ lxvd2x HhX,10,TABLE
+
+ # input loading
+ lxvd2x C0X,0,DATA # load C0
+
+ # swap permuting
+ vperm C0,C0,C0,swap_mask
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ vpmsumd Cl,C0,Hl # L = Hl*Cl
+ vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch
+ vpmsumd Ch,C0,Hh # H = Hh*Ch
+
+ # reduction first phase # [1]
+ vpmsumd RP,Cl,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,Cm,8 # [2]
+ vsldoi Ml,Cm,zero,8 # [2]
+ vsldoi RP,RP,RP,8 # [1]
+ vxor Ch,Ch,Mh # [2]
+ vxor Cl,Cl,Ml # [2]
+ vxor Cl,Cl,RP # [1]
+
+ # reduction second phase
+ vpmsumd RP,Cl,poly_l
+ vxor Ch,Cl,Ch
+ vxor C,Ch,RP
+
+ addi DATA,DATA,0x10
+ clrldi LENGTH,LENGTH,60 # keep remaining length mod 16
+Lrem:
+ cmpldi LENGTH,0
+ beq Ldone
+
+ # table loading (H^1 triple)
+ li 9,0x100
+ li 10,0x200
+ lxvd2x HlX,0,TABLE
+ lxvd2x HX, 9,TABLE
+ lxvd2x HhX,10,TABLE
+
+ # input loading: copy the partial block byte-by-byte into a
+ # zero-initialized 16-byte stack buffer (implicit zero padding)
+ stdu SP,-16(SP)
+ stvx zero,0,SP
+Lst_loop:
+ subic. LENGTH,LENGTH,1
+ lbzx 7,LENGTH,DATA
+ stbx 7,LENGTH,SP
+ bne Lst_loop
+ lxvd2x C0X,0,SP
+ addi SP,SP,16
+
+ # swap permuting
+ vperm C0,C0,C0,swap_mask
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ vpmsumd Cl,C0,Hl # L = Hl*Cl
+ vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch
+ vpmsumd Ch,C0,Hh # H = Hh*Ch
+
+ # reduction first phase # [1]
+ vpmsumd RP,Cl,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,Cm,8 # [2]
+ vsldoi Ml,Cm,zero,8 # [2]
+ vsldoi RP,RP,RP,8 # [1]
+ vxor Ch,Ch,Mh # [2]
+ vxor Cl,Cl,Ml # [2]
+ vxor Cl,Cl,RP # [1]
+
+ # reduction second phase
+ vpmsumd RP,Cl,poly_l
+ vxor Ch,Cl,Ch
+ vxor C,Ch,RP
+
+Ldone:
+ vperm C,C,C,swap_mask # swap digest back to memory byte order
+ stxvd2x CX,0,X # store C
+ blr
+EPILOGUE(_nettle_gcm_hash8)
+
+ # gcm_fill (uint8_t *ctr, size_t blocks, union gcm_block *buffer)
+ #
+ # Expand the counter block at `ctr` into `blocks` consecutive counter
+ # blocks in `buffer`, then write the advanced counter back to `ctr`.
+ # Processes 8 blocks per loop iteration, then 4, 2, 1 for the tail.
+
+.align 5
+PROLOGUE(_nettle_gcm_fill)
+ ld 6,.swap_mask@got(TOCP)
+ lvx swap_mask,0,6
+
+ # build increment constants I1..I8: the value k in the lowest byte,
+ # i.e. an increment of k on the low 32-bit word of the counter
+ vxor zero,zero,zero
+ vspltisb I1,1
+ vspltisb I2,2
+ vspltisb I3,3
+ vspltisb I4,4
+ vspltisb I5,5
+ vspltisb I6,6
+ vspltisb I7,7
+ vspltisb I8,8
+ vsldoi I1,zero,I1,1
+ vsldoi I2,zero,I2,1
+ vsldoi I3,zero,I3,1
+ vsldoi I4,zero,I4,1
+ vsldoi I5,zero,I5,1
+ vsldoi I6,zero,I6,1
+ vsldoi I7,zero,I7,1
+ vsldoi I8,zero,I8,1
+
+ lxvd2x CTR0X,0,CTR
+ vperm CTR0,CTR0,CTR0,swap_mask
+
+ srdi 6,BLOCKS,3 # 8x loop count (blocks / 8)
+ cmpldi 6,0
+ beq Lfill_4x
+
+ # save non-volatile GPRs r25-r31 below SP without a frame
+ # NOTE(review): relies on the ABI-protected area under SP — confirm
+ # against the ELFv2 ABI red-zone rules used by other nettle ppc code
+ std 25,-56(SP);
+ std 26,-48(SP);
+ std 27,-40(SP);
+ std 28,-32(SP);
+ std 29,-24(SP);
+ std 30,-16(SP);
+ std 31,-8(SP);
+
+ li 25,0x10
+ li 26,0x20
+ li 27,0x30
+ li 28,0x40
+ li 29,0x50
+ li 30,0x60
+ li 31,0x70
+
+ mtctr 6
+L8x_fill_loop:
+ # CTRk = CTR0 + k; vadduwm adds per 32-bit lane, so only the low
+ # counter word is incremented (GCM-style 32-bit counter, no carry
+ # into the fixed IV words)
+ vadduwm CTR1,CTR0,I1
+ vadduwm CTR2,CTR0,I2
+ vadduwm CTR3,CTR0,I3
+ vadduwm CTR4,CTR0,I4
+ vadduwm CTR5,CTR0,I5
+ vadduwm CTR6,CTR0,I6
+ vadduwm CTR7,CTR0,I7
+
+ # swap back to memory byte order before storing
+ vperm CTR0S,CTR0,CTR0,swap_mask
+ vperm CTR1,CTR1,CTR1,swap_mask
+ vperm CTR2,CTR2,CTR2,swap_mask
+ vperm CTR3,CTR3,CTR3,swap_mask
+ vperm CTR4,CTR4,CTR4,swap_mask
+ vperm CTR5,CTR5,CTR5,swap_mask
+ vperm CTR6,CTR6,CTR6,swap_mask
+ vperm CTR7,CTR7,CTR7,swap_mask
+
+ stxvd2x CTR0SX,0,BUFFER
+ stxvd2x CTR1X,25,BUFFER
+ stxvd2x CTR2X,26,BUFFER
+ stxvd2x CTR3X,27,BUFFER
+ stxvd2x CTR4X,28,BUFFER
+ stxvd2x CTR5X,29,BUFFER
+ stxvd2x CTR6X,30,BUFFER
+ stxvd2x CTR7X,31,BUFFER
+
+ vadduwm CTR0,CTR0,I8 # advance working counter by 8
+ addi BUFFER,BUFFER,0x80
+ bdnz L8x_fill_loop
+
+ # restore r25-r31
+ ld 25,-56(SP);
+ ld 26,-48(SP);
+ ld 27,-40(SP);
+ ld 28,-32(SP);
+ ld 29,-24(SP);
+ ld 30,-16(SP);
+ ld 31,-8(SP);
+
+ clrldi BLOCKS,BLOCKS,61 # keep remaining blocks mod 8
+
+Lfill_4x:
+ srdi 6,BLOCKS,2
+ cmpldi 6,0
+ beq Lfill_2x
+
+ li 8,0x10
+ li 9,0x20
+ li 10,0x30
+
+ vadduwm CTR1,CTR0,I1
+ vadduwm CTR2,CTR0,I2
+ vadduwm CTR3,CTR0,I3
+
+ vperm CTR0S,CTR0,CTR0,swap_mask
+ vperm CTR1,CTR1,CTR1,swap_mask
+ vperm CTR2,CTR2,CTR2,swap_mask
+ vperm CTR3,CTR3,CTR3,swap_mask
+
+ stxvd2x CTR0SX,0,BUFFER
+ stxvd2x CTR1X,8,BUFFER
+ stxvd2x CTR2X,9,BUFFER
+ stxvd2x CTR3X,10,BUFFER
+
+ vadduwm CTR0,CTR0,I4
+ addi BUFFER,BUFFER,0x40
+
+ clrldi BLOCKS,BLOCKS,62 # keep remaining blocks mod 4
+
+Lfill_2x:
+ srdi 6,BLOCKS,1
+ cmpldi 6,0
+ beq Lfill_1x
+
+ li 10,0x10
+
+ vadduwm CTR1,CTR0,I1
+
+ vperm CTR0S,CTR0,CTR0,swap_mask
+ vperm CTR1,CTR1,CTR1,swap_mask
+
+ stxvd2x CTR0SX,0,BUFFER
+ stxvd2x CTR1X,10,BUFFER
+
+ vadduwm CTR0,CTR0,I2
+ addi BUFFER,BUFFER,0x20
+
+ clrldi BLOCKS,BLOCKS,63 # keep remaining blocks mod 2
+
+Lfill_1x:
+ cmpldi BLOCKS,0
+ beq Lfill_done
+
+ vperm CTR0S,CTR0,CTR0,swap_mask
+
+ stxvd2x CTR0SX,0,BUFFER
+
+ vadduwm CTR0,CTR0,I1
+
+Lfill_done:
+ # write the advanced counter back for the caller
+ vperm CTR0,CTR0,CTR0,swap_mask
+ stxvd2x CTR0X,0,CTR
+
+ blr
+EPILOGUE(_nettle_gcm_fill)
+
+ .data
+ # Constant used in the two vpmsumd reduction phases; bytes encode the
+ # GCM polynomial in the bit-reflected form this code multiplies by
+ # (low byte 1, high byte 0xc2).
+ .align 4
+.polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+ # vperm mask reversing each 8-byte doubleword (little-endian load fixup)
+ .align 4
+.swap_mask:
+ .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7
+ # vperm masks gathering the high / low doublewords of two source
+ # vectors (indices 16-31 select from the second operand)
+ .align 4
+.hidw_mask:
+ .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0
+ .align 4
+.lodw_mask:
+ .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8
diff --git a/powerpc64le/machine.m4 b/powerpc64le/machine.m4
new file mode 100644
index 00000000..e69de29b
diff --git a/testsuite/gcm-test.c b/testsuite/gcm-test.c
index c8174019..df1fc94a 100644
--- a/testsuite/gcm-test.c
+++ b/testsuite/gcm-test.c
@@ -170,6 +170,29 @@ test_main(void)
"16aedbf5a0de6a57a637b39b"),
SHEX("619cc5aefffe0bfa462af43c1699d050"));
+ /* Test 128 bytes */
+ test_aead(&nettle_gcm_aes128, NULL,
+ SHEX("feffe9928665731c6d6a8f9467308308"),
+ SHEX(""),
+ SHEX("d9313225f88406e5a55909c5aff5269a"
+ "86a7a9531534f7da2e4c303d8a318a72"
+ "1c3c0c95956809532fcf0e2449a6b525"
+ "b16aedf5aa0de657ba637b391aafd255"
+ "5ae376bc5e9f6a1b08e34db7a6ee0736"
+ "9ba662ea12f6f197e6bc3ed69d2480f3"
+ "ea5691347f2ba69113eb37910ebc18c8"
+ "0f697234582016fa956ca8f63ae6b473"),
+ SHEX("42831ec2217774244b7221b784d0d49c"
+ "e3aa212f2c02a4e035c17e2329aca12e"
+ "21d514b25466931c7d8f6a5aac84aa05"
+ "1ba30b396a0aac973d58e091473f5985"
+ "874b1178906ddbeab04ab2fe6cce8c57"
+ "8d7e961bd13fd6a8c56b66ca5e576492"
+ "1a48cd8bda04e66343e73055118b69b9"
+ "ced486813846958a11e602c03cfc232b"),
+ SHEX("cafebabefacedbaddecaf888"),
+ SHEX("796836f1246c9d735c5e1be0a715ccc3"));
+
/* Test case 7 */
test_aead(&nettle_gcm_aes192, NULL,
SHEX("00000000000000000000000000000000"
--
2.17.1
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs