Currently AES_encrypt, AES_decrypt, and the key expansion are
optimized.  Direct support for CBC, ECB, CTR, etc. will come
in subsequent changes.

The following measurements were taken on a SPARC-T4.

Baseline (OPENSSL_sparcv9cap=0):

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
aes-128 cbc      85241.72k    90930.60k    94282.67k    95158.95k    95087.08k
aes-192 cbc      73300.41k    77576.49k    80022.95k    80657.75k    80838.66k
aes-256 cbc      64390.17k    67656.43k    69442.30k    69893.80k    70022.49k

With AES opcodes enabled:

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
aes-128 cbc     298612.77k   353669.87k   389577.22k   400843.61k   406031.02k
aes-192 cbc     282841.19k   323486.85k   364641.37k   375664.98k   378989.23k
aes-256 cbc     269449.24k   310281.81k   343170.05k   352550.23k   355317.08k

There were several interesting implementation issues dealt with here.

The AES opcodes need the decryption key in a different format than the
generic sparc v9 code wants (basically, no pre-application of the
MixColumn).  To address this and also to facilitate using the AES
opcodes for key expansion, a new aes_sparccore.c file is used in place
of aes_core.c when building for sparcv9.

The non-AES-opcode sparc code was changed to use a real proper PIC sequence
with sparc_arch.h macros.  The code which was there flushes the UltraSPARC
return address stack, negatively impacting performance.

Any call, or jmpl with destination register %o7, that lacks a paired
ret/retl will effectively corrupt the return address stack, making
every subsequent ret/retl miss the cache and take a full pipeline
flush.

The sparc_arch.h PIC loading sequences lack this problem, and also
they know how to do non-PIC loading of symbol addresses even more
efficiently.

Next, usage of the AES instructions is unnecessarily difficult if
the key is not 8-byte aligned.  So we use a trick so that we always
have an aligned key to work with.

We determine whether the AES_KEY is 8-byte or only 4-byte aligned; these
are the only two possibilities on sparc.  If it is 8-byte aligned, we use
the existing interpretation of the AES_KEY contents.  However, if it is
only 4-byte aligned, we put the ->rounds value first and then the key,
so that the key becomes 8-byte aligned.  All of the aes_sparccore.c
and aes-sparcv9.pl code is aware of this convention.

Since we don't have any control over the alignment of the input buffers,
output buffers, and input key, we make use of alignaddr, faligndata,
and masked partial stores to deal with the unaligned cases.

Signed-off-by: David S. Miller <da...@davemloft.net>
---
 Configure                     |    2 +-
 crypto/aes/Makefile           |    4 +-
 crypto/aes/aes_sparccore.c    |  272 ++++++++++++++++++++++
 crypto/aes/asm/aes-sparcv9.pl |  502 ++++++++++++++++++++++++++++++++++++++++-
 crypto/sparc_arch.h           |   34 +++
 5 files changed, 802 insertions(+), 12 deletions(-)
 create mode 100644 crypto/aes/aes_sparccore.c

diff --git a/Configure b/Configure
index 2333a63..66b4ff8 100755
--- a/Configure
+++ b/Configure
@@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf";
 
 my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o";
 my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_sparccore.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
 my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
index 8edd358..2f32983 100644
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -66,8 +66,10 @@ aesni-x86_64.s: asm/aesni-x86_64.pl
 aesni-sha1-x86_64.s:   asm/aesni-sha1-x86_64.pl
        $(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
 
-aes-sparcv9.s: asm/aes-sparcv9.pl
+aes-sparcv9.S: asm/aes-sparcv9.pl
        $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
+aes-sparcv9.s: aes-sparcv9.S
+       $(CC) $(CFLAGS) -E aes-sparcv9.S > $@
 
 aes-ppc.s:     asm/aes-ppc.pl
        $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
diff --git a/crypto/aes/aes_sparccore.c b/crypto/aes/aes_sparccore.c
new file mode 100644
index 0000000..2842cbc
--- /dev/null
+++ b/crypto/aes/aes_sparccore.c
@@ -0,0 +1,272 @@
+/* crypto/aes/aes_sparccore.c -*- mode:C; c-file-style: "eay" -*- */
+/**
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rij...@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bossela...@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barr...@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This variant of aes_core.c is used for sparc so that we can use the
+ * Sparc AES opcodes for key expansion and lay out the keys in the format
+ * that the AES opcode encrypt/decrypt routines expect.
+ */
+
+#include <assert.h>
+
+#include <stdlib.h>
+#include <openssl/crypto.h>
+#include <openssl/aes.h>
+#include "aes_locl.h"
+
+#include "sparc_arch.h"
+
+static const u8 Te4[256] = {   /* AES S-box: byte substitution used by the software key expansion below */
+    0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
+    0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
+    0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
+    0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
+    0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
+    0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
+    0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
+    0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
+    0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
+    0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
+    0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
+    0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
+    0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
+    0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
+    0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
+    0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
+    0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
+    0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
+    0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
+    0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
+    0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
+    0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
+    0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
+    0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
+    0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
+    0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
+    0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
+    0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
+    0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
+    0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
+    0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
+    0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
+};
+static const u32 rcon[] = {    /* round constants XORed into the first word of each expansion step */
+       0x01000000, 0x02000000, 0x04000000, 0x08000000,
+       0x10000000, 0x20000000, 0x40000000, 0x80000000,
+       0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/*
+ * We place the actual key->rounds storage either at the beginning, or the end,
+ * of the AES_KEY storage, in order to align the key itself to 64-bits.
+ */
+
+static u32 *key_pointer(AES_KEY *key)
+{
+       /* Bit 2 of the address is set when key is 4-byte but not 8-byte aligned. */
+       unsigned long base = (unsigned long) key;
+       unsigned long skip = base & 4;
+       /* Skipping that one word puts the round keys on an 8-byte boundary. */
+       return (u32 *) (base + skip);
+}
+
+static int *rounds_pointer(AES_KEY *key)
+{
+       unsigned long base = (unsigned long) key;
+       /* 8-byte aligned: the count is stored at byte offset 240, after the round keys. */
+       if ((base & 4) == 0)
+               return (int *) (base + 240);
+       return (int *) base;    /* 4-byte aligned: the count occupies the first word */
+}
+
+extern void aes_sparc_hw_expand_key(const unsigned char *userKey, u32 *rk,
+                                   const int bits);
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ * userKey holds bits/8 key bytes; bits must be 128, 192 or 256.
+ * Returns 0 on success, -1 if userKey or key is NULL, and -2 for
+ * any other key length.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                       AES_KEY *key) {
+       u32 *rk;
+       int i = 0;
+       u32 temp;
+       int *rp;
+
+       if (!userKey || !key)
+               return -1;
+       if (bits != 128 && bits != 192 && bits != 256)
+               return -2;
+
+       rk = key_pointer(key);          /* 8-byte aligned round key area */
+       rp = rounds_pointer(key);       /* where the round count is kept */
+
+       if (bits==128)
+               *rp = 10;
+       else if (bits==192)
+               *rp = 12;
+       else
+               *rp = 14;
+
+       if (OPENSSL_sparcv9cap_P & SPARCV9_AES) {
+               aes_sparc_hw_expand_key(userKey, rk, bits);
+               return 0;
+       }
+
+       rk[0] = GETU32(userKey     );
+       rk[1] = GETU32(userKey +  4);
+       rk[2] = GETU32(userKey +  8);
+       rk[3] = GETU32(userKey + 12);
+       /* Te4 entries are cast to u32 before shifting; as promoted ints, << 24 overflows for bytes >= 0x80. */
+       if (bits == 128) {
+               while (1) {
+                       temp  = rk[3];
+                       rk[4] = rk[0] ^
+                               ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
+                               ((u32)Te4[(temp >>  8) & 0xff] << 16) ^
+                               ((u32)Te4[(temp      ) & 0xff] << 8) ^
+                               ((u32)Te4[(temp >> 24)       ]) ^
+                               rcon[i];
+                       rk[5] = rk[1] ^ rk[4];
+                       rk[6] = rk[2] ^ rk[5];
+                       rk[7] = rk[3] ^ rk[6];
+                       if (++i == 10)          /* schedule complete */
+                               return 0;
+                       rk += 4;
+               }
+       }
+       rk[4] = GETU32(userKey + 16);
+       rk[5] = GETU32(userKey + 20);
+       if (bits == 192) {
+               while (1) {
+                       temp = rk[ 5];
+                       rk[ 6] = rk[ 0] ^
+                               ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
+                               ((u32)Te4[(temp >>  8) & 0xff] << 16) ^
+                               ((u32)Te4[(temp      ) & 0xff] << 8) ^
+                               ((u32)Te4[(temp >> 24)       ]) ^
+                               rcon[i];
+                       rk[ 7] = rk[ 1] ^ rk[ 6];
+                       rk[ 8] = rk[ 2] ^ rk[ 7];
+                       rk[ 9] = rk[ 3] ^ rk[ 8];
+                       if (++i == 8)           /* final pass needs only four words */
+                               return 0;
+                       rk[10] = rk[ 4] ^ rk[ 9];
+                       rk[11] = rk[ 5] ^ rk[10];
+                       rk += 6;
+               }
+       }
+       rk[6] = GETU32(userKey + 24);
+       rk[7] = GETU32(userKey + 28);
+       if (bits == 256) {
+               while (1) {
+                       temp = rk[ 7];
+                       rk[ 8] = rk[ 0] ^
+                               ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
+                               ((u32)Te4[(temp >>  8) & 0xff] << 16) ^
+                               ((u32)Te4[(temp      ) & 0xff] << 8) ^
+                               ((u32)Te4[(temp >> 24)       ]) ^
+                               rcon[i];
+                       rk[ 9] = rk[ 1] ^ rk[ 8];
+                       rk[10] = rk[ 2] ^ rk[ 9];
+                       rk[11] = rk[ 3] ^ rk[10];
+                       if (++i == 7)           /* final pass needs only four words */
+                               return 0;
+                       temp = rk[11];          /* second half: substitution without rotation or rcon */
+                       rk[12] = rk[ 4] ^
+                               ((u32)Te4[(temp >> 24)       ] << 24) ^
+                               ((u32)Te4[(temp >> 16) & 0xff] << 16) ^
+                               ((u32)Te4[(temp >>  8) & 0xff] << 8) ^
+                               ((u32)Te4[(temp      ) & 0xff]);
+                       rk[13] = rk[ 5] ^ rk[12];
+                       rk[14] = rk[ 6] ^ rk[13];
+                       rk[15] = rk[ 7] ^ rk[14];
+                       rk += 8;
+               }
+       }
+       return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key) {
+       /* Same arguments and return codes as AES_set_encrypt_key(). */
+        u32 *rk;
+       int i, j, status;
+       u32 temp;
+       int *rp;
+
+       /* first, start with an encryption schedule */
+       status = AES_set_encrypt_key(userKey, bits, key);
+       if (status < 0)
+               return status;
+
+       rk = key_pointer(key);
+       rp = rounds_pointer(key);
+
+       /* invert the order of the round keys: */
+       for (i = 0, j = 4*(*rp); i < j; i += 4, j -= 4) {       /* swap four-word round keys end for end */
+               temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
+               temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+               temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+               temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+       }
+
+       if (OPENSSL_sparcv9cap_P & SPARCV9_AES)
+               return 0;       /* AES opcode path: reversed schedule is used without the inverse MixColumn below */
+
+       /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+       for (i = 1; i < (*rp); i++) {
+               rk += 4;
+               for (j = 0; j < 4; j++) {
+                       u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+                       tp1 = rk[j];
+                       m = tp1 & 0x80808080;   /* top bit of each of the four packed bytes */
+                       tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+                               ((m - (m >> 7)) & 0x1b1b1b1b);  /* tp2 = 2*tp1 per byte in GF(2^8) */
+                       m = tp2 & 0x80808080;
+                       tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+                               ((m - (m >> 7)) & 0x1b1b1b1b);  /* tp4 = 4*tp1 */
+                       m = tp4 & 0x80808080;
+                       tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+                               ((m - (m >> 7)) & 0x1b1b1b1b);  /* tp8 = 8*tp1 */
+                       tp9 = tp8 ^ tp1;        /* 9*tp1 */
+                       tpb = tp9 ^ tp2;        /* 11*tp1 */
+                       tpd = tp9 ^ tp4;        /* 13*tp1 */
+                       tpe = tp8 ^ tp4 ^ tp2;  /* 14*tp1 */
+                       rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+                               (tp9 >> 8) ^ (tp9 << 24) ^
+                               (tpb >> 24) ^ (tpb << 8);       /* combine the multiples, each rotated by whole bytes */
+               }
+       }
+       return 0;
+}
diff --git a/crypto/aes/asm/aes-sparcv9.pl b/crypto/aes/asm/aes-sparcv9.pl
index 403c4d1..f022b7b 100755
--- a/crypto/aes/asm/aes-sparcv9.pl
+++ b/crypto/aes/asm/aes-sparcv9.pl
@@ -79,8 +79,52 @@ $code.=<<___ if ($bits==64);
 .register      %g3,#scratch
 ___
 $code.=<<___;
+#include "sparc_arch.h"
+/* Two AES rounds: state registers I0/I1 go through scratch T0/T1 and back to I0/I1; round keys are read from FP registers KEY_BASE+0 .. KEY_BASE+6. */
+#define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \\
+       AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \\
+       AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \\
+       AES_EROUND01(KEY_BASE +  4, T0, T1, I0) \\
+       AES_EROUND23(KEY_BASE +  6, T0, T1, I1)
+/* Same as ENCRYPT_TWO_ROUNDS, but the second round is issued with the _L opcode variants. */
+#define ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \\
+       AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \\
+       AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \\
+       AES_EROUND01_L(KEY_BASE +  4, T0, T1, I0) \\
+       AES_EROUND23_L(KEY_BASE +  6, T0, T1, I1)
+
+       /* 10 rounds: four ENCRYPT_TWO_ROUNDS followed by one ENCRYPT_TWO_ROUNDS_LAST */
+#define ENCRYPT_128(KEY_BASE, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\
+       ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1)
+/* Decrypt counterparts: within each round the 23 opcode is issued before the 01 opcode. */
+#define DECRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \\
+       AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \\
+       AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \\
+       AES_DROUND23(KEY_BASE +  4, T0, T1, I1) \\
+       AES_DROUND01(KEY_BASE +  6, T0, T1, I0)
+
+#define DECRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \\
+       AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \\
+       AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \\
+       AES_DROUND23_L(KEY_BASE +  4, T0, T1, I1) \\
+       AES_DROUND01_L(KEY_BASE +  6, T0, T1, I0)
+
+       /* 10 rounds: four DECRYPT_TWO_ROUNDS followed by one DECRYPT_TWO_ROUNDS_LAST */
+#define DECRYPT_128(KEY_BASE, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \\
+       DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1)
+
 .section       ".text",#alloc,#execinstr
 
+SPARC_PIC_THUNK(o4)
+
 .align 256
 AES_Te:
 ___
@@ -190,7 +234,12 @@ $code.=<<___;
 _sparcv9_AES_encrypt:
        save    %sp,-$frame-$locals,%sp
        stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
-       ld      [$key+240],$rounds
+       andcc   $key, 4, $t0
+       mov     $key, $t1
+       add     $key, 240, $t2
+       movne   %icc, $t1, $t2
+       ld      [$t2],$rounds
+       add     $key, $t0, $key
        ld      [$key+0],$t0
        ld      [$key+4],$t1                    !
        ld      [$key+8],$t2
@@ -512,6 +561,110 @@ _sparcv9_AES_encrypt:
 .align 32
 .globl AES_encrypt
 AES_encrypt:
+       /* %o0=input, %o1=output, %o2=key */
+       SPARC_LOAD_V9_CAPS_LEAF(o4, g1)
+       andcc   %o4, SPARCV9_AES, %g0
+       be      .Lenc_software
+        andcc  %o2, 0x4, %g1
+       mov     %o2, %g2
+       add     %g2, 240, %g3
+       movne   %icc, %g2, %g3
+       add     %o2, %g1, %o2
+       andcc   %o0, 0x7, %g0
+       be,pt   %icc, 1f
+        ld     [%g3], %g1
+
+       alignaddr %o0, %g0, %o0
+       ldd     [%o0 + 0x00], %f10
+       ldd     [%o0 + 0x08], %f12
+       ldd     [%o0 + 0x10], %f14
+       faligndata %f10, %f12, %f4
+       ba,pt   %icc, 2f
+        faligndata %f12, %f14, %f6
+
+1:
+       ldd     [%o0 + 0x00], %f4
+       ldd     [%o0 + 0x08], %f6
+2:
+       ldd     [%o2 + 0x00], %f8
+       ldd     [%o2 + 0x08], %f10
+
+       cmp     %g1, 12
+       fxor    %f8, %f4, %f4
+       bl      2f
+        fxor   %f10, %f6, %f6
+
+       be      1f
+        ldd    [%o2 + 0x10], %f8
+
+       /* 256-bit key, 14 rounds */
+       ldd     [%o2 + 0x18], %f10
+       ldd     [%o2 + 0x20], %f12
+       ldd     [%o2 + 0x28], %f14
+       add     %o2, 0x20, %o2
+       ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+       ldd     [%o2 + 0x10], %f8
+
+1:
+       /* 192-bit key, 12 rounds */
+       ldd     [%o2 + 0x18], %f10
+       ldd     [%o2 + 0x20], %f12
+       ldd     [%o2 + 0x28], %f14
+       add     %o2, 0x20, %o2
+       ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+2:
+       /* 128-bit key, 10 rounds */
+       ldd     [%o2 + 0x10], %f8
+       ldd     [%o2 + 0x18], %f10
+       ldd     [%o2 + 0x20], %f12
+       ldd     [%o2 + 0x28], %f14
+       ldd     [%o2 + 0x30], %f16
+       ldd     [%o2 + 0x38], %f18
+       ldd     [%o2 + 0x40], %f20
+       ldd     [%o2 + 0x48], %f22
+       ldd     [%o2 + 0x50], %f24
+       ldd     [%o2 + 0x58], %f26
+       ldd     [%o2 + 0x60], %f28
+       ldd     [%o2 + 0x68], %f30
+       ldd     [%o2 + 0x70], %f32
+       ldd     [%o2 + 0x78], %f34
+       ldd     [%o2 + 0x80], %f36
+       ldd     [%o2 + 0x88], %f38
+       ldd     [%o2 + 0x90], %f40
+       ldd     [%o2 + 0x98], %f42
+       ldd     [%o2 + 0xa0], %f44
+       ldd     [%o2 + 0xa8], %f46
+
+       ENCRYPT_128(8, 4, 6, 0, 2)
+
+       andcc   %o1, 0x7, %g0
+       bne,pn  %icc, 1f
+        nop
+
+       std     %f4, [%o1 + 0x00]
+       retl
+        std    %f6, [%o1 + 0x08]
+
+1:
+       alignaddrl %o1, %g0, %g3
+       mov     0xff, %g2
+       sub     %o1, %g3, %g1
+       srl     %g2, %g1, %g2
+       orn     %g0, %g2, %g1
+
+       faligndata %f4, %f4, %f10
+       faligndata %f4, %f6, %f12
+       faligndata %f6, %f6, %f14
+
+       stda    %f10, [%g3 + %g2] 0xc0
+       std     %f12, [%g3 + 0x08]
+       add     %g3, 0x10, %g3
+       retl
+        stda   %f14, [%g3 + %g1] 0xc0
+
+.Lenc_software:
        or      %o0,%o1,%g1
        andcc   %g1,3,%g0
        bnz,pn  %xcc,.Lunaligned_enc
@@ -522,8 +675,7 @@ AES_encrypt:
        ld      [%i0+8],%o2
        ld      [%i0+12],%o3
 
-1:     call    .+8
-       add     %o7,AES_Te-1b,%o4
+       SPARC_LOAD_ADDRESS(AES_Te, o4, o5)
        call    _sparcv9_AES_encrypt
        mov     %i2,%o5
 
@@ -582,8 +734,7 @@ AES_encrypt:
        or      %l7,%l6,%l6
        or      %l4,%l6,%o3
 
-1:     call    .+8
-       add     %o7,AES_Te-1b,%o4
+       SPARC_LOAD_ADDRESS(AES_Te, o4, o5)
        call    _sparcv9_AES_encrypt
        mov     %i2,%o5
 
@@ -736,7 +887,12 @@ $code.=<<___;
 _sparcv9_AES_decrypt:
        save    %sp,-$frame-$locals,%sp
        stx     %i7,[%sp+$bias+$frame+0]        ! off-load return address
-       ld      [$key+240],$rounds
+       andcc   $key, 4, $t0
+       mov     $key, $t1
+       add     $key, 240, $t2
+       movne   %icc, $t1, $t2
+       ld      [$t2],$rounds
+       add     $key, $t0, $key
        ld      [$key+0],$t0
        ld      [$key+4],$t1                    !
        ld      [$key+8],$t2
@@ -1058,6 +1214,110 @@ _sparcv9_AES_decrypt:
 .align 32
 .globl AES_decrypt
 AES_decrypt:
+       /* %o0=input, %o1=output, %o2=key */
+       SPARC_LOAD_V9_CAPS_LEAF(o4, g1)
+       andcc   %o4, SPARCV9_AES, %g0
+       be      .Ldec_software
+        andcc  %o2, 0x4, %g1
+       mov     %o2, %g2
+       add     %g2, 240, %g3
+       movne   %icc, %g2, %g3
+       add     %o2, %g1, %o2
+       andcc   %o0, 0x7, %g0
+       be,pt   %icc, 1f
+        ld     [%g3], %g1
+
+       alignaddr %o0, %g0, %o0
+       ldd     [%o0 + 0x00], %f10
+       ldd     [%o0 + 0x08], %f12
+       ldd     [%o0 + 0x10], %f14
+       faligndata %f10, %f12, %f4
+       ba,pt   %icc, 2f
+        faligndata %f12, %f14, %f6
+
+1:
+       ldd     [%o0 + 0x00], %f4
+       ldd     [%o0 + 0x08], %f6
+2:
+       ldd     [%o2 + 0x00], %f8
+       ldd     [%o2 + 0x08], %f10
+
+       cmp     %g1, 12
+       fxor    %f8, %f4, %f4
+       bl      2f
+        fxor   %f10, %f6, %f6
+
+       be      1f
+        ldd    [%o2 + 0x18], %f8
+
+       /* 256-bit key, 14 rounds */
+       ldd     [%o2 + 0x10], %f10
+       ldd     [%o2 + 0x28], %f12
+       ldd     [%o2 + 0x20], %f14
+       add     %o2, 0x20, %o2
+       DECRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+       ldd     [%o2 + 0x18], %f8
+
+1:
+       /* 192-bit key, 12 rounds */
+       ldd     [%o2 + 0x10], %f10
+       ldd     [%o2 + 0x28], %f12
+       ldd     [%o2 + 0x20], %f14
+       add     %o2, 0x20, %o2
+       DECRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+2:
+       /* 128-bit key, 10 rounds */
+       ldd     [%o2 + 0x18], %f8
+       ldd     [%o2 + 0x10], %f10
+       ldd     [%o2 + 0x28], %f12
+       ldd     [%o2 + 0x20], %f14
+       ldd     [%o2 + 0x38], %f16
+       ldd     [%o2 + 0x30], %f18
+       ldd     [%o2 + 0x48], %f20
+       ldd     [%o2 + 0x40], %f22
+       ldd     [%o2 + 0x58], %f24
+       ldd     [%o2 + 0x50], %f26
+       ldd     [%o2 + 0x68], %f28
+       ldd     [%o2 + 0x60], %f30
+       ldd     [%o2 + 0x78], %f32
+       ldd     [%o2 + 0x70], %f34
+       ldd     [%o2 + 0x88], %f36
+       ldd     [%o2 + 0x80], %f38
+       ldd     [%o2 + 0x98], %f40
+       ldd     [%o2 + 0x90], %f42
+       ldd     [%o2 + 0xa8], %f44
+       ldd     [%o2 + 0xa0], %f46
+
+       DECRYPT_128(8, 4, 6, 0, 2)
+
+       andcc   %o1, 0x7, %g0
+       bne,pn  %icc, 1f
+        nop
+
+       std     %f4, [%o1 + 0x00]
+       retl
+        std    %f6, [%o1 + 0x08]
+
+1:
+       alignaddrl %o1, %g0, %g3
+       mov     0xff, %g2
+       sub     %o1, %g3, %g1
+       srl     %g2, %g1, %g2
+       orn     %g0, %g2, %g1
+
+       faligndata %f4, %f4, %f10
+       faligndata %f4, %f6, %f12
+       faligndata %f6, %f6, %f14
+
+       stda    %f10, [%g3 + %g2] 0xc0
+       std     %f12, [%g3 + 0x08]
+       add     %g3, 0x10, %g3
+       retl
+        stda   %f14, [%g3 + %g1] 0xc0
+
+.Ldec_software:
        or      %o0,%o1,%g1
        andcc   %g1,3,%g0
        bnz,pn  %xcc,.Lunaligned_dec
@@ -1068,8 +1328,7 @@ AES_decrypt:
        ld      [%i0+8],%o2
        ld      [%i0+12],%o3
 
-1:     call    .+8
-       add     %o7,AES_Td-1b,%o4
+       SPARC_LOAD_ADDRESS(AES_Td, o4, o5)
        call    _sparcv9_AES_decrypt
        mov     %i2,%o5
 
@@ -1128,8 +1387,7 @@ AES_decrypt:
        or      %l7,%l6,%l6
        or      %l4,%l6,%o3
 
-1:     call    .+8
-       add     %o7,AES_Td-1b,%o4
+       SPARC_LOAD_ADDRESS(AES_Td, o4, o5)
        call    _sparcv9_AES_decrypt
        mov     %i2,%o5
 
@@ -1169,6 +1427,230 @@ AES_decrypt:
        restore
 .type  AES_decrypt,#function
 .size  AES_decrypt,(.-AES_decrypt)
+
+.align 32
+.globl aes_sparc_hw_expand_key
+aes_sparc_hw_expand_key:
+       /* %o0=input_key, %o1=output_key, %o2=key_len_in_bits; leaf routine that writes the expanded schedule at %o1 */
+       andcc   %o0, 0x7, %g1           /* %g1 = low three address bits of the input key, reused below */
+       be,pt   %icc, 1f
+        nop
+       alignaddr %o0, %g0, %o0         /* unaligned input: round %o0 down; the byte offset is latched for faligndata */
+       ldd     [%o0 + 0x00], %f10
+       ldd     [%o0 + 0x08], %f12
+       ldd     [%o0 + 0x10], %f14
+       faligndata %f10, %f12, %f0
+       ba,pt   %icc, 2f
+        faligndata %f12, %f14, %f2
+
+1:
+       ldd     [%o0 + 0x00], %f0
+       ldd     [%o0 + 0x08], %f2
+
+2:
+       std     %f0, [%o1 + 0x00]       /* key bytes 0..15 are copied to the schedule unchanged */
+       std     %f2, [%o1 + 0x08]
+       add     %o1, 0x10, %o1
+
+       cmp     %o2, 192                /* below 192 -> 128-bit code at 4f, equal -> 192-bit code at 3f, else 256-bit */
+       bl      4f
+        nop
+
+       be      3f
+        nop
+
+       /* 256-bit key expansion */
+       andcc   %g1, 7, %g0             /* re-test the input alignment saved in %g1 */
+       be,pt   %icc, 1f
+        nop
+
+       ldd     [%o0 + 0x10], %f10
+       ldd     [%o0 + 0x18], %f12
+       ldd     [%o0 + 0x20], %f14
+       faligndata %f10, %f12, %f4
+       ba,pt   %icc, 2f
+        faligndata %f12, %f14, %f6
+
+1:
+       ldd     [%o0 + 0x10], %f4
+       ldd     [%o0 + 0x18], %f6
+
+2:
+       std     %f4, [%o1 + 0x00]       /* key bytes 16..31 are copied to the schedule unchanged */
+       std     %f6, [%o1 + 0x08]
+       add     %o1, 0x10, %o1
+
+       AES_KEXPAND1(0, 6, 0x0, 8)      /* derived doublewords accumulate in %f8 .. %f58 */
+       AES_KEXPAND2(2, 8, 10)
+       AES_KEXPAND0(4, 10, 12)
+       AES_KEXPAND2(6, 12, 14)
+       AES_KEXPAND1(8, 14, 0x1, 16)
+       AES_KEXPAND2(10, 16, 18)
+       AES_KEXPAND0(12, 18, 20)
+       AES_KEXPAND2(14, 20, 22)
+       AES_KEXPAND1(16, 22, 0x2, 24)
+       AES_KEXPAND2(18, 24, 26)
+       AES_KEXPAND0(20, 26, 28)
+       AES_KEXPAND2(22, 28, 30)
+       AES_KEXPAND1(24, 30, 0x3, 32)
+       AES_KEXPAND2(26, 32, 34)
+       AES_KEXPAND0(28, 34, 36)
+       AES_KEXPAND2(30, 36, 38)
+       AES_KEXPAND1(32, 38, 0x4, 40)
+       AES_KEXPAND2(34, 40, 42)
+       AES_KEXPAND0(36, 42, 44)
+       AES_KEXPAND2(38, 44, 46)
+       AES_KEXPAND1(40, 46, 0x5, 48)
+       AES_KEXPAND2(42, 48, 50)
+       AES_KEXPAND0(44, 50, 52)
+       AES_KEXPAND2(46, 52, 54)
+       AES_KEXPAND1(48, 54, 0x6, 56)
+       AES_KEXPAND2(50, 56, 58)
+
+       std     %f8, [%o1 + 0x00]       /* write out the 26 derived doublewords */
+       std     %f10, [%o1 + 0x08]
+       std     %f12, [%o1 + 0x10]
+       std     %f14, [%o1 + 0x18]
+       std     %f16, [%o1 + 0x20]
+       std     %f18, [%o1 + 0x28]
+       std     %f20, [%o1 + 0x30]
+       std     %f22, [%o1 + 0x38]
+       std     %f24, [%o1 + 0x40]
+       std     %f26, [%o1 + 0x48]
+       std     %f28, [%o1 + 0x50]
+       std     %f30, [%o1 + 0x58]
+       std     %f32, [%o1 + 0x60]
+       std     %f34, [%o1 + 0x68]
+       std     %f36, [%o1 + 0x70]
+       std     %f38, [%o1 + 0x78]
+       std     %f40, [%o1 + 0x80]
+       std     %f42, [%o1 + 0x88]
+       std     %f44, [%o1 + 0x90]
+       std     %f46, [%o1 + 0x98]
+       std     %f48, [%o1 + 0xa0]
+       std     %f50, [%o1 + 0xa8]
+       std     %f52, [%o1 + 0xb0]
+       std     %f54, [%o1 + 0xb8]
+       std     %f56, [%o1 + 0xc0]
+       ba,pt   %xcc, 80f               /* the final store executes in the branch delay slot */
+        std    %f58, [%o1 + 0xc8]
+
+3:
+       /* 192-bit key expansion */
+       andcc   %g1, 7, %g0             /* re-test the input alignment saved in %g1 */
+       be,pt   %icc, 1f
+        nop
+
+       ldd     [%o0 + 0x10], %f10
+       ldd     [%o0 + 0x18], %f12
+       ba,pt   %icc, 2f
+        faligndata %f10, %f12, %f4
+
+1:
+       ldd     [%o0 + 0x10], %f4
+
+2:
+       std     %f4, [%o1 + 0x00]       /* key bytes 16..23 are copied to the schedule unchanged */
+       add     %o1, 0x08, %o1
+
+       AES_KEXPAND1(0, 4, 0x0, 6)      /* derived doublewords accumulate in %f6 .. %f50 */
+       AES_KEXPAND2(2, 6, 8)
+       AES_KEXPAND2(4, 8, 10)
+       AES_KEXPAND1(6, 10, 0x1, 12)
+       AES_KEXPAND2(8, 12, 14)
+       AES_KEXPAND2(10, 14, 16)
+       AES_KEXPAND1(12, 16, 0x2, 18)
+       AES_KEXPAND2(14, 18, 20)
+       AES_KEXPAND2(16, 20, 22)
+       AES_KEXPAND1(18, 22, 0x3, 24)
+       AES_KEXPAND2(20, 24, 26)
+       AES_KEXPAND2(22, 26, 28)
+       AES_KEXPAND1(24, 28, 0x4, 30)
+       AES_KEXPAND2(26, 30, 32)
+       AES_KEXPAND2(28, 32, 34)
+       AES_KEXPAND1(30, 34, 0x5, 36)
+       AES_KEXPAND2(32, 36, 38)
+       AES_KEXPAND2(34, 38, 40)
+       AES_KEXPAND1(36, 40, 0x6, 42)
+       AES_KEXPAND2(38, 42, 44)
+       AES_KEXPAND2(40, 44, 46)
+       AES_KEXPAND1(42, 46, 0x7, 48)
+       AES_KEXPAND2(44, 48, 50)
+
+       std     %f6, [%o1 + 0x00]       /* write out the 23 derived doublewords */
+       std     %f8, [%o1 + 0x08]
+       std     %f10, [%o1 + 0x10]
+       std     %f12, [%o1 + 0x18]
+       std     %f14, [%o1 + 0x20]
+       std     %f16, [%o1 + 0x28]
+       std     %f18, [%o1 + 0x30]
+       std     %f20, [%o1 + 0x38]
+       std     %f22, [%o1 + 0x40]
+       std     %f24, [%o1 + 0x48]
+       std     %f26, [%o1 + 0x50]
+       std     %f28, [%o1 + 0x58]
+       std     %f30, [%o1 + 0x60]
+       std     %f32, [%o1 + 0x68]
+       std     %f34, [%o1 + 0x70]
+       std     %f36, [%o1 + 0x78]
+       std     %f38, [%o1 + 0x80]
+       std     %f40, [%o1 + 0x88]
+       std     %f42, [%o1 + 0x90]
+       std     %f44, [%o1 + 0x98]
+       std     %f46, [%o1 + 0xa0]
+       std     %f48, [%o1 + 0xa8]
+       ba,pt   %xcc, 80f               /* the final store executes in the branch delay slot */
+        std    %f50, [%o1 + 0xb0]
+
+4:
+       /* 128-bit key expansion */
+       AES_KEXPAND1(0, 2, 0x0, 4)      /* derived doublewords accumulate in %f4 .. %f42 */
+       AES_KEXPAND2(2, 4, 6)
+       AES_KEXPAND1(4, 6, 0x1, 8)
+       AES_KEXPAND2(6, 8, 10)
+       AES_KEXPAND1(8, 10, 0x2, 12)
+       AES_KEXPAND2(10, 12, 14)
+       AES_KEXPAND1(12, 14, 0x3, 16)
+       AES_KEXPAND2(14, 16, 18)
+       AES_KEXPAND1(16, 18, 0x4, 20)
+       AES_KEXPAND2(18, 20, 22)
+       AES_KEXPAND1(20, 22, 0x5, 24)
+       AES_KEXPAND2(22, 24, 26)
+       AES_KEXPAND1(24, 26, 0x6, 28)
+       AES_KEXPAND2(26, 28, 30)
+       AES_KEXPAND1(28, 30, 0x7, 32)
+       AES_KEXPAND2(30, 32, 34)
+       AES_KEXPAND1(32, 34, 0x8, 36)
+       AES_KEXPAND2(34, 36, 38)
+       AES_KEXPAND1(36, 38, 0x9, 40)
+       AES_KEXPAND2(38, 40, 42)
+
+       std     %f4, [%o1 + 0x00]       /* write out the 20 derived doublewords */
+       std     %f6, [%o1 + 0x08]
+       std     %f8, [%o1 + 0x10]
+       std     %f10, [%o1 + 0x18]
+       std     %f12, [%o1 + 0x20]
+       std     %f14, [%o1 + 0x28]
+       std     %f16, [%o1 + 0x30]
+       std     %f18, [%o1 + 0x38]
+       std     %f20, [%o1 + 0x40]
+       std     %f22, [%o1 + 0x48]
+       std     %f24, [%o1 + 0x50]
+       std     %f26, [%o1 + 0x58]
+       std     %f28, [%o1 + 0x60]
+       std     %f30, [%o1 + 0x68]
+       std     %f32, [%o1 + 0x70]
+       std     %f34, [%o1 + 0x78]
+       std     %f36, [%o1 + 0x80]
+       std     %f38, [%o1 + 0x88]
+       std     %f40, [%o1 + 0x90]
+       std     %f42, [%o1 + 0x98]
+80:
+       retl                            /* common exit for all three key sizes */
+        nop
+.type  aes_sparc_hw_expand_key,#function
+.size  aes_sparc_hw_expand_key,(.-aes_sparc_hw_expand_key)
+
 ___
 
 # fmovs instructions substituting for FP nops were originally added
diff --git a/crypto/sparc_arch.h b/crypto/sparc_arch.h
index bcb4829..f478ce3 100644
--- a/crypto/sparc_arch.h
+++ b/crypto/sparc_arch.h
@@ -27,6 +27,40 @@ extern int OPENSSL_sparcv9cap_P;
 
 #if __ASSEMBLER__
 
+#define F3F(x,y,z)     (((x)<<30)|((y)<<19)|((z)<<5))  /* base instruction word: fields at bits 30, 19 and 5 */
+/* FPD_ENCODE: for a register number below 64, moves bit 5 of the number down to bit 0 to form the 5-bit field value. */
+#define FPD_ENCODE(x)  (((x) >> 5) | ((x) & ~(0x20)))
+/* Field placers: encoded register operands at bits 14, 0, 9 and 25; raw 5-bit immediates at bits 0 and 9. */
+#define RS1(x)         (FPD_ENCODE(x) << 14)
+#define RS2(x)         (FPD_ENCODE(x) <<  0)
+#define RS3(x)         (FPD_ENCODE(x) <<  9)
+#define RD(x)          (FPD_ENCODE(x) << 25)
+#define IMM5_0(x)      ((x)           <<  0)
+#define IMM5_9(x)      ((x)           <<  9)
+/* Each AES_* macro emits one hand-encoded instruction word; arguments are FP register numbers, except the immediate c of AES_KEXPAND1. */
+#define AES_EROUND01(a,b,c,d)  \
+       .word   (F3F(2, 0x19, 0)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND23(a,b,c,d)  \
+       .word   (F3F(2, 0x19, 1)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND01(a,b,c,d)  \
+       .word   (F3F(2, 0x19, 2)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND23(a,b,c,d)  \
+       .word   (F3F(2, 0x19, 3)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND01_L(a,b,c,d)        \
+       .word   (F3F(2, 0x19, 4)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND23_L(a,b,c,d)        \
+       .word   (F3F(2, 0x19, 5)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND01_L(a,b,c,d)        \
+       .word   (F3F(2, 0x19, 6)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND23_L(a,b,c,d)        \
+       .word   (F3F(2, 0x19, 7)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_KEXPAND1(a,b,c,d)  \
+       .word   (F3F(2, 0x19, 8)|RS1(a)|RS2(b)|IMM5_9(c)|RD(d));
+#define AES_KEXPAND0(a,b,c)    \
+       .word   (F3F(2, 0x36, 0x130)|RS1(a)|RS2(b)|RD(c));
+#define AES_KEXPAND2(a,b,c)    \
+       .word   (F3F(2, 0x36, 0x131)|RS1(a)|RS2(b)|RD(c));
+
 #ifdef __PIC__
 #define SPARC_PIC_THUNK(reg)   \
        .align  32;             \
-- 
1.7.10.4

______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                       openssl-dev@openssl.org
Automated List Manager                           majord...@openssl.org

Reply via email to