Currently AES_encrypt, AES_decrypt, and the key expansion are optimized. Direct support for CBC, ECB, CTR, etc. will come in subsequent changes.
The following measurements were taken on a SPARC-T4.

Baseline (OPENSSL_sparcv9cap=0):

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
aes-128 cbc     85241.72k    90930.60k    94282.67k    95158.95k    95087.08k
aes-192 cbc     73300.41k    77576.49k    80022.95k    80657.75k    80838.66k
aes-256 cbc     64390.17k    67656.43k    69442.30k    69893.80k    70022.49k

With AES opcodes enabled:

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
aes-128 cbc    298612.77k   353669.87k   389577.22k   400843.61k   406031.02k
aes-192 cbc    282841.19k   323486.85k   364641.37k   375664.98k   378989.23k
aes-256 cbc    269449.24k   310281.81k   343170.05k   352550.23k   355317.08k

There were several interesting implementation issues dealt with here.

The AES opcodes need the decryption key in a different format than the generic sparc v9 code wants (basically, without the inverse MixColumn transform pre-applied to the round keys). To address this and also to facilitate using the AES opcodes for key expansion, a new aes_sparccore.c file is used in place of aes_core.c when building for sparcv9.

The non-AES-opcode sparc code was changed to use a proper PIC sequence built from the sparc_arch.h macros. The code which was there flushes the UltraSPARC return address stack, negatively impacting performance. Any call, or jmpl with destination register %o7, that lacks a paired ret/retl will effectively corrupt the return address stack, making every subsequent ret/retl miss in the return address stack and take a full pipeline flush. The sparc_arch.h PIC loading sequences do not have this problem, and they also know how to load symbol addresses even more efficiently in the non-PIC case.

Next, usage of the AES instructions is unnecessarily difficult if the key is not 8-byte aligned. So we use a trick so that we always have an aligned key to work with. We determine whether the AES_KEY is 8-byte or 4-byte aligned; these are the only two possibilities on sparc. If it is 8-byte aligned, we use the existing interpretation of the AES_KEY contents. 
However, if it is 4-byte aligned, we put the ->rounds value first and then the key so that the key becomes 8-byte aligned. All of the aes_sparccore.c and aes-sparcv9.pl code is aware of this convention. Since we don't have any control over the alignment of the input buffers, output buffers, and input key, we make use of alignaddr, faligndata, and masked partial stores to deal with the unaligned cases. Signed-off-by: David S. Miller <da...@davemloft.net> --- Configure | 2 +- crypto/aes/Makefile | 4 +- crypto/aes/aes_sparccore.c | 272 ++++++++++++++++++++++ crypto/aes/asm/aes-sparcv9.pl | 502 ++++++++++++++++++++++++++++++++++++++++- crypto/sparc_arch.h | 34 +++ 5 files changed, 802 insertions(+), 12 deletions(-) create mode 100644 crypto/aes/aes_sparccore.c diff --git a/Configure b/Configure index 2333a63..66b4ff8 100755 --- a/Configure +++ b/Configure @@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; +my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_sparccore.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o 
fcrypt_b.o:::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile index 8edd358..2f32983 100644 --- a/crypto/aes/Makefile +++ b/crypto/aes/Makefile @@ -66,8 +66,10 @@ aesni-x86_64.s: asm/aesni-x86_64.pl aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl $(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@ -aes-sparcv9.s: asm/aes-sparcv9.pl +aes-sparcv9.S: asm/aes-sparcv9.pl $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ +aes-sparcv9.s: aes-sparcv9.S + $(CC) $(CFLAGS) -E aes-sparcv9.S > $@ aes-ppc.s: asm/aes-ppc.pl $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@ diff --git a/crypto/aes/aes_sparccore.c b/crypto/aes/aes_sparccore.c new file mode 100644 index 0000000..2842cbc --- /dev/null +++ b/crypto/aes/aes_sparccore.c @@ -0,0 +1,272 @@ +/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */ +/** + * rijndael-alg-fst.c + * + * @version 3.0 (December 2000) + * + * Optimised ANSI C code for the Rijndael cipher (now AES) + * + * @author Vincent Rijmen <vincent.rij...@esat.kuleuven.ac.be> + * @author Antoon Bosselaers <antoon.bossela...@esat.kuleuven.ac.be> + * @author Paulo Barreto <paulo.barr...@terra.com.br> + * + * This code is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This variant of aes_core.c is used for sparc so that we can use the + * Sparc AES opcodes for key expansion and layout the keys in the format + * that the AES opcode encrypt/decrypt expect. + */ + +#include <assert.h> + +#include <stdlib.h> +#include <openssl/crypto.h> +#include <openssl/aes.h> +#include "aes_locl.h" + +#include "sparc_arch.h" + +static const u8 Te4[256] = { + 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U, + 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U, + 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U, + 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U, + 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU, + 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U, + 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU, + 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U, + 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U, + 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U, + 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU, + 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU, + 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U, + 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U, + 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U, + 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U, + 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U, + 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U, + 0x60U, 0x81U, 0x4fU, 
0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U, + 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU, + 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU, + 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U, + 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U, + 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U, + 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U, + 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU, + 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU, + 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU, + 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U, + 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU, + 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U, + 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U +}; +static const u32 rcon[] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ +}; + +/* + * We place the actual key->rounds storage either at the beginning, or the end, + * of the AES_KEY storage, in order to align the key itself to 64-bits. + */ + +static u32 *key_pointer(AES_KEY *key) +{ + unsigned long addr = (unsigned long) key; + + addr += (addr & 4); + + return (u32 *) addr; +} + +static int *rounds_pointer(AES_KEY *key) +{ + unsigned long addr = (unsigned long) key; + + addr += (addr & 4) ? 0 : 240; + + return (int *) addr; +} + +extern void aes_sparc_hw_expand_key(const unsigned char *userKey, u32 *rk, + const int bits); + +/** + * Expand the cipher key into the encryption key schedule. 
+ */ +int AES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key) { + u32 *rk; + int i = 0; + u32 temp; + int *rp; + + if (!userKey || !key) + return -1; + if (bits != 128 && bits != 192 && bits != 256) + return -2; + + rk = key_pointer(key); + rp = rounds_pointer(key); + + if (bits==128) + *rp = 10; + else if (bits==192) + *rp = 12; + else + *rp = 14; + + if (OPENSSL_sparcv9cap_P & SPARCV9_AES) { + aes_sparc_hw_expand_key(userKey, rk, bits); + return 0; + } + + rk[0] = GETU32(userKey ); + rk[1] = GETU32(userKey + 4); + rk[2] = GETU32(userKey + 8); + rk[3] = GETU32(userKey + 12); + if (bits == 128) { + while (1) { + temp = rk[3]; + rk[4] = rk[0] ^ + (Te4[(temp >> 16) & 0xff] << 24) ^ + (Te4[(temp >> 8) & 0xff] << 16) ^ + (Te4[(temp ) & 0xff] << 8) ^ + (Te4[(temp >> 24) ]) ^ + rcon[i]; + rk[5] = rk[1] ^ rk[4]; + rk[6] = rk[2] ^ rk[5]; + rk[7] = rk[3] ^ rk[6]; + if (++i == 10) { + return 0; + } + rk += 4; + } + } + rk[4] = GETU32(userKey + 16); + rk[5] = GETU32(userKey + 20); + if (bits == 192) { + while (1) { + temp = rk[ 5]; + rk[ 6] = rk[ 0] ^ + (Te4[(temp >> 16) & 0xff] << 24) ^ + (Te4[(temp >> 8) & 0xff] << 16) ^ + (Te4[(temp ) & 0xff] << 8) ^ + (Te4[(temp >> 24) ]) ^ + rcon[i]; + rk[ 7] = rk[ 1] ^ rk[ 6]; + rk[ 8] = rk[ 2] ^ rk[ 7]; + rk[ 9] = rk[ 3] ^ rk[ 8]; + if (++i == 8) { + return 0; + } + rk[10] = rk[ 4] ^ rk[ 9]; + rk[11] = rk[ 5] ^ rk[10]; + rk += 6; + } + } + rk[6] = GETU32(userKey + 24); + rk[7] = GETU32(userKey + 28); + if (bits == 256) { + while (1) { + temp = rk[ 7]; + rk[ 8] = rk[ 0] ^ + (Te4[(temp >> 16) & 0xff] << 24) ^ + (Te4[(temp >> 8) & 0xff] << 16) ^ + (Te4[(temp ) & 0xff] << 8) ^ + (Te4[(temp >> 24) ]) ^ + rcon[i]; + rk[ 9] = rk[ 1] ^ rk[ 8]; + rk[10] = rk[ 2] ^ rk[ 9]; + rk[11] = rk[ 3] ^ rk[10]; + if (++i == 7) { + return 0; + } + temp = rk[11]; + rk[12] = rk[ 4] ^ + (Te4[(temp >> 24) ] << 24) ^ + (Te4[(temp >> 16) & 0xff] << 16) ^ + (Te4[(temp >> 8) & 0xff] << 8) ^ + (Te4[(temp ) & 0xff]); + rk[13] = rk[ 5] ^ 
rk[12]; + rk[14] = rk[ 6] ^ rk[13]; + rk[15] = rk[ 7] ^ rk[14]; + + rk += 8; + } + } + return 0; +} + +/** + * Expand the cipher key into the decryption key schedule. + */ +int AES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key) { + + u32 *rk; + int i, j, status; + u32 temp; + int *rp; + + /* first, start with an encryption schedule */ + status = AES_set_encrypt_key(userKey, bits, key); + if (status < 0) + return status; + + rk = key_pointer(key); + rp = rounds_pointer(key); + + /* invert the order of the round keys: */ + for (i = 0, j = 4*(*rp); i < j; i += 4, j -= 4) { + temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; + temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; + temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; + temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; + } + + if (OPENSSL_sparcv9cap_P & SPARCV9_AES) + return 0; + + /* apply the inverse MixColumn transform to all round keys but the first and the last: */ + for (i = 1; i < (*rp); i++) { + rk += 4; + for (j = 0; j < 4; j++) { + u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m; + + tp1 = rk[j]; + m = tp1 & 0x80808080; + tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^ + ((m - (m >> 7)) & 0x1b1b1b1b); + m = tp2 & 0x80808080; + tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^ + ((m - (m >> 7)) & 0x1b1b1b1b); + m = tp4 & 0x80808080; + tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^ + ((m - (m >> 7)) & 0x1b1b1b1b); + tp9 = tp8 ^ tp1; + tpb = tp9 ^ tp2; + tpd = tp9 ^ tp4; + tpe = tp8 ^ tp4 ^ tp2; + rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ + (tp9 >> 8) ^ (tp9 << 24) ^ + (tpb >> 24) ^ (tpb << 8); + } + } + return 0; +} diff --git a/crypto/aes/asm/aes-sparcv9.pl b/crypto/aes/asm/aes-sparcv9.pl index 403c4d1..f022b7b 100755 --- a/crypto/aes/asm/aes-sparcv9.pl +++ b/crypto/aes/asm/aes-sparcv9.pl @@ -79,8 +79,52 @@ $code.=<<___ if ($bits==64); .register %g3,#scratch ___ $code.=<<___; +#include "sparc_arch.h" + +#define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \\ + AES_EROUND01(KEY_BASE + 0, I0, 
I1, T0) \\ + AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \\ + AES_EROUND01(KEY_BASE + 4, T0, T1, I0) \\ + AES_EROUND23(KEY_BASE + 6, T0, T1, I1) + +#define ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \\ + AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \\ + AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \\ + AES_EROUND01_L(KEY_BASE + 4, T0, T1, I0) \\ + AES_EROUND23_L(KEY_BASE + 6, T0, T1, I1) + + /* 10 rounds */ +#define ENCRYPT_128(KEY_BASE, I0, I1, T0, T1) \\ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \\ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \\ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\ + ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\ + ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1) + +#define DECRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \\ + AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \\ + AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \\ + AES_DROUND23(KEY_BASE + 4, T0, T1, I1) \\ + AES_DROUND01(KEY_BASE + 6, T0, T1, I0) + +#define DECRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \\ + AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \\ + AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \\ + AES_DROUND23_L(KEY_BASE + 4, T0, T1, I1) \\ + AES_DROUND01_L(KEY_BASE + 6, T0, T1, I0) + + /* 10 rounds */ +#define DECRYPT_128(KEY_BASE, I0, I1, T0, T1) \\ + DECRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \\ + DECRYPT_TWO_ROUNDS(KEY_BASE + 8, I0, I1, T0, T1) \\ + DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\ + DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\ + DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1) + .section ".text",#alloc,#execinstr +SPARC_PIC_THUNK(o4) + .align 256 AES_Te: ___ @@ -190,7 +234,12 @@ $code.=<<___; _sparcv9_AES_encrypt: save %sp,-$frame-$locals,%sp stx %i7,[%sp+$bias+$frame+0] ! off-load return address - ld [$key+240],$rounds + andcc $key, 4, $t0 + mov $key, $t1 + add $key, 240, $t2 + movne %icc, $t1, $t2 + ld [$t2],$rounds + add $key, $t0, $key ld [$key+0],$t0 ld [$key+4],$t1 ! 
ld [$key+8],$t2 @@ -512,6 +561,110 @@ _sparcv9_AES_encrypt: .align 32 .globl AES_encrypt AES_encrypt: + /* %o0=input, %o1=output, %o2=key */ + SPARC_LOAD_V9_CAPS_LEAF(o4, g1) + andcc %o4, SPARCV9_AES, %g0 + be .Lenc_software + andcc %o2, 0x4, %g1 + mov %o2, %g2 + add %g2, 240, %g3 + movne %icc, %g2, %g3 + add %o2, %g1, %o2 + andcc %o0, 0x7, %g0 + be,pt %icc, 1f + ld [%g3], %g1 + + alignaddr %o0, %g0, %o0 + ldd [%o0 + 0x00], %f10 + ldd [%o0 + 0x08], %f12 + ldd [%o0 + 0x10], %f14 + faligndata %f10, %f12, %f4 + ba,pt %icc, 2f + faligndata %f12, %f14, %f6 + +1: + ldd [%o0 + 0x00], %f4 + ldd [%o0 + 0x08], %f6 +2: + ldd [%o2 + 0x00], %f8 + ldd [%o2 + 0x08], %f10 + + cmp %g1, 12 + fxor %f8, %f4, %f4 + bl 2f + fxor %f10, %f6, %f6 + + be 1f + ldd [%o2 + 0x10], %f8 + + /* 256-bit key, 14 rounds */ + ldd [%o2 + 0x18], %f10 + ldd [%o2 + 0x20], %f12 + ldd [%o2 + 0x28], %f14 + add %o2, 0x20, %o2 + ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2) + + ldd [%o2 + 0x10], %f8 + +1: + /* 192-bit key, 12 rounds */ + ldd [%o2 + 0x18], %f10 + ldd [%o2 + 0x20], %f12 + ldd [%o2 + 0x28], %f14 + add %o2, 0x20, %o2 + ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2) + +2: + /* 128-bit key, 10 rounds */ + ldd [%o2 + 0x10], %f8 + ldd [%o2 + 0x18], %f10 + ldd [%o2 + 0x20], %f12 + ldd [%o2 + 0x28], %f14 + ldd [%o2 + 0x30], %f16 + ldd [%o2 + 0x38], %f18 + ldd [%o2 + 0x40], %f20 + ldd [%o2 + 0x48], %f22 + ldd [%o2 + 0x50], %f24 + ldd [%o2 + 0x58], %f26 + ldd [%o2 + 0x60], %f28 + ldd [%o2 + 0x68], %f30 + ldd [%o2 + 0x70], %f32 + ldd [%o2 + 0x78], %f34 + ldd [%o2 + 0x80], %f36 + ldd [%o2 + 0x88], %f38 + ldd [%o2 + 0x90], %f40 + ldd [%o2 + 0x98], %f42 + ldd [%o2 + 0xa0], %f44 + ldd [%o2 + 0xa8], %f46 + + ENCRYPT_128(8, 4, 6, 0, 2) + + andcc %o1, 0x7, %g0 + bne,pn %icc, 1f + nop + + std %f4, [%o1 + 0x00] + retl + std %f6, [%o1 + 0x08] + +1: + alignaddrl %o1, %g0, %g3 + mov 0xff, %g2 + sub %o1, %g3, %g1 + srl %g2, %g1, %g2 + orn %g0, %g2, %g1 + + faligndata %f4, %f4, %f10 + faligndata %f4, %f6, %f12 + faligndata %f6, %f6, %f14 + + 
stda %f10, [%g3 + %g2] 0xc0 + std %f12, [%g3 + 0x08] + add %g3, 0x10, %g3 + retl + stda %f14, [%g3 + %g1] 0xc0 + +.Lenc_software: or %o0,%o1,%g1 andcc %g1,3,%g0 bnz,pn %xcc,.Lunaligned_enc @@ -522,8 +675,7 @@ AES_encrypt: ld [%i0+8],%o2 ld [%i0+12],%o3 -1: call .+8 - add %o7,AES_Te-1b,%o4 + SPARC_LOAD_ADDRESS(AES_Te, o4, o5) call _sparcv9_AES_encrypt mov %i2,%o5 @@ -582,8 +734,7 @@ AES_encrypt: or %l7,%l6,%l6 or %l4,%l6,%o3 -1: call .+8 - add %o7,AES_Te-1b,%o4 + SPARC_LOAD_ADDRESS(AES_Te, o4, o5) call _sparcv9_AES_encrypt mov %i2,%o5 @@ -736,7 +887,12 @@ $code.=<<___; _sparcv9_AES_decrypt: save %sp,-$frame-$locals,%sp stx %i7,[%sp+$bias+$frame+0] ! off-load return address - ld [$key+240],$rounds + andcc $key, 4, $t0 + mov $key, $t1 + add $key, 240, $t2 + movne %icc, $t1, $t2 + ld [$t2],$rounds + add $key, $t0, $key ld [$key+0],$t0 ld [$key+4],$t1 ! ld [$key+8],$t2 @@ -1058,6 +1214,110 @@ _sparcv9_AES_decrypt: .align 32 .globl AES_decrypt AES_decrypt: + /* %o0=input, %o1=output, %o2=key */ + SPARC_LOAD_V9_CAPS_LEAF(o4, g1) + andcc %o4, SPARCV9_AES, %g0 + be .Ldec_software + andcc %o2, 0x4, %g1 + mov %o2, %g2 + add %g2, 240, %g3 + movne %icc, %g2, %g3 + add %o2, %g1, %o2 + andcc %o0, 0x7, %g0 + be,pt %icc, 1f + ld [%g3], %g1 + + alignaddr %o0, %g0, %o0 + ldd [%o0 + 0x00], %f10 + ldd [%o0 + 0x08], %f12 + ldd [%o0 + 0x10], %f14 + faligndata %f10, %f12, %f4 + ba,pt %icc, 2f + faligndata %f12, %f14, %f6 + +1: + ldd [%o0 + 0x00], %f4 + ldd [%o0 + 0x08], %f6 +2: + ldd [%o2 + 0x00], %f8 + ldd [%o2 + 0x08], %f10 + + cmp %g1, 12 + fxor %f8, %f4, %f4 + bl 2f + fxor %f10, %f6, %f6 + + be 1f + ldd [%o2 + 0x18], %f8 + + /* 256-bit key, 14 rounds */ + ldd [%o2 + 0x10], %f10 + ldd [%o2 + 0x28], %f12 + ldd [%o2 + 0x20], %f14 + add %o2, 0x20, %o2 + DECRYPT_TWO_ROUNDS(8, 4, 6, 0, 2) + + ldd [%o2 + 0x18], %f8 + +1: + /* 192-bit key, 12 rounds */ + ldd [%o2 + 0x10], %f10 + ldd [%o2 + 0x28], %f12 + ldd [%o2 + 0x20], %f14 + add %o2, 0x20, %o2 + DECRYPT_TWO_ROUNDS(8, 4, 6, 0, 2) + +2: + /* 
128-bit key, 10 rounds */ + ldd [%o2 + 0x18], %f8 + ldd [%o2 + 0x10], %f10 + ldd [%o2 + 0x28], %f12 + ldd [%o2 + 0x20], %f14 + ldd [%o2 + 0x38], %f16 + ldd [%o2 + 0x30], %f18 + ldd [%o2 + 0x48], %f20 + ldd [%o2 + 0x40], %f22 + ldd [%o2 + 0x58], %f24 + ldd [%o2 + 0x50], %f26 + ldd [%o2 + 0x68], %f28 + ldd [%o2 + 0x60], %f30 + ldd [%o2 + 0x78], %f32 + ldd [%o2 + 0x70], %f34 + ldd [%o2 + 0x88], %f36 + ldd [%o2 + 0x80], %f38 + ldd [%o2 + 0x98], %f40 + ldd [%o2 + 0x90], %f42 + ldd [%o2 + 0xa8], %f44 + ldd [%o2 + 0xa0], %f46 + + DECRYPT_128(8, 4, 6, 0, 2) + + andcc %o1, 0x7, %g0 + bne,pn %icc, 1f + nop + + std %f4, [%o1 + 0x00] + retl + std %f6, [%o1 + 0x08] + +1: + alignaddrl %o1, %g0, %g3 + mov 0xff, %g2 + sub %o1, %g3, %g1 + srl %g2, %g1, %g2 + orn %g0, %g2, %g1 + + faligndata %f4, %f4, %f10 + faligndata %f4, %f6, %f12 + faligndata %f6, %f6, %f14 + + stda %f10, [%g3 + %g2] 0xc0 + std %f12, [%g3 + 0x08] + add %g3, 0x10, %g3 + retl + stda %f14, [%g3 + %g1] 0xc0 + +.Ldec_software: or %o0,%o1,%g1 andcc %g1,3,%g0 bnz,pn %xcc,.Lunaligned_dec @@ -1068,8 +1328,7 @@ AES_decrypt: ld [%i0+8],%o2 ld [%i0+12],%o3 -1: call .+8 - add %o7,AES_Td-1b,%o4 + SPARC_LOAD_ADDRESS(AES_Td, o4, o5) call _sparcv9_AES_decrypt mov %i2,%o5 @@ -1128,8 +1387,7 @@ AES_decrypt: or %l7,%l6,%l6 or %l4,%l6,%o3 -1: call .+8 - add %o7,AES_Td-1b,%o4 + SPARC_LOAD_ADDRESS(AES_Td, o4, o5) call _sparcv9_AES_decrypt mov %i2,%o5 @@ -1169,6 +1427,230 @@ AES_decrypt: restore .type AES_decrypt,#function .size AES_decrypt,(.-AES_decrypt) + +.align 32 +.globl aes_sparc_hw_expand_key +aes_sparc_hw_expand_key: + /* %o0=input_key, %o1=output_key, %o2=key_len_in_bits */ + andcc %o0, 0x7, %g1 + be,pt %icc, 1f + nop + alignaddr %o0, %g0, %o0 + ldd [%o0 + 0x00], %f10 + ldd [%o0 + 0x08], %f12 + ldd [%o0 + 0x10], %f14 + faligndata %f10, %f12, %f0 + ba,pt %icc, 2f + faligndata %f12, %f14, %f2 + +1: + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + +2: + std %f0, [%o1 + 0x00] + std %f2, [%o1 + 0x08] + add %o1, 0x10, %o1 + + cmp 
%o2, 192 + bl 4f + nop + + be 3f + nop + + /* 256-bit key expansion */ + andcc %g1, 7, %g0 + be,pt %icc, 1f + nop + + ldd [%o0 + 0x10], %f10 + ldd [%o0 + 0x18], %f12 + ldd [%o0 + 0x20], %f14 + faligndata %f10, %f12, %f4 + ba,pt %icc, 2f + faligndata %f12, %f14, %f6 + +1: + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + +2: + std %f4, [%o1 + 0x00] + std %f6, [%o1 + 0x08] + add %o1, 0x10, %o1 + + AES_KEXPAND1(0, 6, 0x0, 8) + AES_KEXPAND2(2, 8, 10) + AES_KEXPAND0(4, 10, 12) + AES_KEXPAND2(6, 12, 14) + AES_KEXPAND1(8, 14, 0x1, 16) + AES_KEXPAND2(10, 16, 18) + AES_KEXPAND0(12, 18, 20) + AES_KEXPAND2(14, 20, 22) + AES_KEXPAND1(16, 22, 0x2, 24) + AES_KEXPAND2(18, 24, 26) + AES_KEXPAND0(20, 26, 28) + AES_KEXPAND2(22, 28, 30) + AES_KEXPAND1(24, 30, 0x3, 32) + AES_KEXPAND2(26, 32, 34) + AES_KEXPAND0(28, 34, 36) + AES_KEXPAND2(30, 36, 38) + AES_KEXPAND1(32, 38, 0x4, 40) + AES_KEXPAND2(34, 40, 42) + AES_KEXPAND0(36, 42, 44) + AES_KEXPAND2(38, 44, 46) + AES_KEXPAND1(40, 46, 0x5, 48) + AES_KEXPAND2(42, 48, 50) + AES_KEXPAND0(44, 50, 52) + AES_KEXPAND2(46, 52, 54) + AES_KEXPAND1(48, 54, 0x6, 56) + AES_KEXPAND2(50, 56, 58) + + std %f8, [%o1 + 0x00] + std %f10, [%o1 + 0x08] + std %f12, [%o1 + 0x10] + std %f14, [%o1 + 0x18] + std %f16, [%o1 + 0x20] + std %f18, [%o1 + 0x28] + std %f20, [%o1 + 0x30] + std %f22, [%o1 + 0x38] + std %f24, [%o1 + 0x40] + std %f26, [%o1 + 0x48] + std %f28, [%o1 + 0x50] + std %f30, [%o1 + 0x58] + std %f32, [%o1 + 0x60] + std %f34, [%o1 + 0x68] + std %f36, [%o1 + 0x70] + std %f38, [%o1 + 0x78] + std %f40, [%o1 + 0x80] + std %f42, [%o1 + 0x88] + std %f44, [%o1 + 0x90] + std %f46, [%o1 + 0x98] + std %f48, [%o1 + 0xa0] + std %f50, [%o1 + 0xa8] + std %f52, [%o1 + 0xb0] + std %f54, [%o1 + 0xb8] + std %f56, [%o1 + 0xc0] + ba,pt %xcc, 80f + std %f58, [%o1 + 0xc8] + +3: + /* 192-bit key expansion */ + andcc %g1, 7, %g0 + be,pt %icc, 1f + nop + + ldd [%o0 + 0x10], %f10 + ldd [%o0 + 0x18], %f12 + ba,pt %icc, 2f + faligndata %f10, %f12, %f4 + +1: + ldd [%o0 + 0x10], 
%f4 + +2: + std %f4, [%o1 + 0x00] + add %o1, 0x08, %o1 + + AES_KEXPAND1(0, 4, 0x0, 6) + AES_KEXPAND2(2, 6, 8) + AES_KEXPAND2(4, 8, 10) + AES_KEXPAND1(6, 10, 0x1, 12) + AES_KEXPAND2(8, 12, 14) + AES_KEXPAND2(10, 14, 16) + AES_KEXPAND1(12, 16, 0x2, 18) + AES_KEXPAND2(14, 18, 20) + AES_KEXPAND2(16, 20, 22) + AES_KEXPAND1(18, 22, 0x3, 24) + AES_KEXPAND2(20, 24, 26) + AES_KEXPAND2(22, 26, 28) + AES_KEXPAND1(24, 28, 0x4, 30) + AES_KEXPAND2(26, 30, 32) + AES_KEXPAND2(28, 32, 34) + AES_KEXPAND1(30, 34, 0x5, 36) + AES_KEXPAND2(32, 36, 38) + AES_KEXPAND2(34, 38, 40) + AES_KEXPAND1(36, 40, 0x6, 42) + AES_KEXPAND2(38, 42, 44) + AES_KEXPAND2(40, 44, 46) + AES_KEXPAND1(42, 46, 0x7, 48) + AES_KEXPAND2(44, 48, 50) + + std %f6, [%o1 + 0x00] + std %f8, [%o1 + 0x08] + std %f10, [%o1 + 0x10] + std %f12, [%o1 + 0x18] + std %f14, [%o1 + 0x20] + std %f16, [%o1 + 0x28] + std %f18, [%o1 + 0x30] + std %f20, [%o1 + 0x38] + std %f22, [%o1 + 0x40] + std %f24, [%o1 + 0x48] + std %f26, [%o1 + 0x50] + std %f28, [%o1 + 0x58] + std %f30, [%o1 + 0x60] + std %f32, [%o1 + 0x68] + std %f34, [%o1 + 0x70] + std %f36, [%o1 + 0x78] + std %f38, [%o1 + 0x80] + std %f40, [%o1 + 0x88] + std %f42, [%o1 + 0x90] + std %f44, [%o1 + 0x98] + std %f46, [%o1 + 0xa0] + std %f48, [%o1 + 0xa8] + ba,pt %xcc, 80f + std %f50, [%o1 + 0xb0] + +4: + /* 128-bit key expansion */ + AES_KEXPAND1(0, 2, 0x0, 4) + AES_KEXPAND2(2, 4, 6) + AES_KEXPAND1(4, 6, 0x1, 8) + AES_KEXPAND2(6, 8, 10) + AES_KEXPAND1(8, 10, 0x2, 12) + AES_KEXPAND2(10, 12, 14) + AES_KEXPAND1(12, 14, 0x3, 16) + AES_KEXPAND2(14, 16, 18) + AES_KEXPAND1(16, 18, 0x4, 20) + AES_KEXPAND2(18, 20, 22) + AES_KEXPAND1(20, 22, 0x5, 24) + AES_KEXPAND2(22, 24, 26) + AES_KEXPAND1(24, 26, 0x6, 28) + AES_KEXPAND2(26, 28, 30) + AES_KEXPAND1(28, 30, 0x7, 32) + AES_KEXPAND2(30, 32, 34) + AES_KEXPAND1(32, 34, 0x8, 36) + AES_KEXPAND2(34, 36, 38) + AES_KEXPAND1(36, 38, 0x9, 40) + AES_KEXPAND2(38, 40, 42) + + std %f4, [%o1 + 0x00] + std %f6, [%o1 + 0x08] + std %f8, [%o1 + 0x10] + std 
%f10, [%o1 + 0x18] + std %f12, [%o1 + 0x20] + std %f14, [%o1 + 0x28] + std %f16, [%o1 + 0x30] + std %f18, [%o1 + 0x38] + std %f20, [%o1 + 0x40] + std %f22, [%o1 + 0x48] + std %f24, [%o1 + 0x50] + std %f26, [%o1 + 0x58] + std %f28, [%o1 + 0x60] + std %f30, [%o1 + 0x68] + std %f32, [%o1 + 0x70] + std %f34, [%o1 + 0x78] + std %f36, [%o1 + 0x80] + std %f38, [%o1 + 0x88] + std %f40, [%o1 + 0x90] + std %f42, [%o1 + 0x98] +80: + retl + nop +.type aes_sparc_hw_expand_key,#function +.size aes_sparc_hw_expand_key,(.-aes_sparc_hw_expand_key) + ___ # fmovs instructions substituting for FP nops were originally added diff --git a/crypto/sparc_arch.h b/crypto/sparc_arch.h index bcb4829..f478ce3 100644 --- a/crypto/sparc_arch.h +++ b/crypto/sparc_arch.h @@ -27,6 +27,40 @@ extern int OPENSSL_sparcv9cap_P; #if __ASSEMBLER__ +#define F3F(x,y,z) (((x)<<30)|((y)<<19)|((z)<<5)) + +#define FPD_ENCODE(x) (((x) >> 5) | ((x) & ~(0x20))) + +#define RS1(x) (FPD_ENCODE(x) << 14) +#define RS2(x) (FPD_ENCODE(x) << 0) +#define RS3(x) (FPD_ENCODE(x) << 9) +#define RD(x) (FPD_ENCODE(x) << 25) +#define IMM5_0(x) ((x) << 0) +#define IMM5_9(x) ((x) << 9) + +#define AES_EROUND01(a,b,c,d) \ + .word (F3F(2, 0x19, 0)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_EROUND23(a,b,c,d) \ + .word (F3F(2, 0x19, 1)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND01(a,b,c,d) \ + .word (F3F(2, 0x19, 2)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND23(a,b,c,d) \ + .word (F3F(2, 0x19, 3)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_EROUND01_L(a,b,c,d) \ + .word (F3F(2, 0x19, 4)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_EROUND23_L(a,b,c,d) \ + .word (F3F(2, 0x19, 5)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND01_L(a,b,c,d) \ + .word (F3F(2, 0x19, 6)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_DROUND23_L(a,b,c,d) \ + .word (F3F(2, 0x19, 7)|RS1(a)|RS2(b)|RS3(c)|RD(d)); +#define AES_KEXPAND1(a,b,c,d) \ + .word (F3F(2, 0x19, 8)|RS1(a)|RS2(b)|IMM5_9(c)|RD(d)); +#define AES_KEXPAND0(a,b,c) \ + .word (F3F(2, 0x36, 
0x130)|RS1(a)|RS2(b)|RD(c)); +#define AES_KEXPAND2(a,b,c) \ + .word (F3F(2, 0x36, 0x131)|RS1(a)|RS2(b)|RD(c)); + #ifdef __PIC__ #define SPARC_PIC_THUNK(reg) \ .align 32; \ -- 1.7.10.4 ______________________________________________________________________ OpenSSL Project http://www.openssl.org Development Mailing List openssl-dev@openssl.org Automated List Manager majord...@openssl.org