The techniques used in this plain v9 implementation are:

1) Use little-endian 32-bit loads when input data is aligned.

2) Avoid having to accumulate into the context hash values every
   loop iteration.

3) In the aligned case try to separate the loads from the first
   use by as many instructions as possible, without sacrificing
   the schedule too much.

4) Attempt to dual-issue as much as possible on UltraSPARC-I/II/III/IV
   and SPARC-T4.

The following measurements of "openssl speed md5" were taken on a
SPARC-T4.

Baseline:

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
md5              11952.12k    40947.20k   113453.25k   204308.82k   266010.62k

With md5-sparcv9.S assembler:

type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes
md5              13062.19k    43100.63k   129569.37k   260317.18k   366650.51k

Signed-off-by: David S. Miller <da...@davemloft.net>
---
 Configure                    |    2 +-
 crypto/md5/Makefile          |    3 +
 crypto/md5/asm/md5-sparcv9.S |  242 ++++++++++++++++++++++++++++++++++++++++++
 crypto/md5/md5_locl.h        |    2 +
 4 files changed, 248 insertions(+), 1 deletion(-)
 create mode 100644 crypto/md5/asm/md5-sparcv9.S

diff --git a/Configure b/Configure
index f6c271f..2333a63 100755
--- a/Configure
+++ b/Configure
@@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf";
 
 my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o 
x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o 
aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o 
sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o 
cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o";
 my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o 
aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o 
rc4_skey.o:::::ghash-ia64.o::void";
-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o 
sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o 
aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o 
sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o 
sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o 
aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o 
sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
 my $alpha_asm="alphacpuid.o:bn_asm.o 
alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o 
sha256-mips.o sha512-mips.o::::::::";
diff --git a/crypto/md5/Makefile b/crypto/md5/Makefile
index 7d42da4..075b8dd 100644
--- a/crypto/md5/Makefile
+++ b/crypto/md5/Makefile
@@ -52,6 +52,9 @@ md5-ia64.s: asm/md5-ia64.S
        $(CC) $(CFLAGS) -E asm/md5-ia64.S | \
        $(PERL) -ne 's/;\s+/;\n/g; print;' > $@
 
+md5-sparcv9.s: asm/md5-sparcv9.S
+       $(CC) $(CFLAGS) -E asm/md5-sparcv9.S > $@
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
diff --git a/crypto/md5/asm/md5-sparcv9.S b/crypto/md5/asm/md5-sparcv9.S
new file mode 100644
index 0000000..30a2fdf
--- /dev/null
+++ b/crypto/md5/asm/md5-sparcv9.S
@@ -0,0 +1,242 @@
+/* Written by David S. Miller <da...@davemloft.net> for the OpenSSL
+ * project. The module is, however, dual licensed under OpenSSL and
+ * CRYPTOGAMS licenses depending on where you obtain it. For further
+ * details see http://www.openssl.org/~appro/cryptogams/.
+ */
+
+#define OASI   %i3     /* holds the caller's %asi while this routine installs its own */
+
+#define H0     %o0     /* H0-H3: running hash words, loaded from and stored back to ctx */
+#define H1     %o1
+#define H2     %o2
+#define H3     %o3
+
+#define A      %l0     /* A-D: working state for the block currently being transformed */
+#define B      %l1
+#define C      %l2
+#define D      %l3
+
+#define TMP1   %l4     /* TMP1-TMP4: scratch registers used by the round and LOAD macros */
+#define TMP2   %l5
+#define TMP3   %l6
+#define TMP4   %l7
+
+#define F1_I(wsrc, wdst, x, y, z, index, const, shift)         \
+       LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2);      \
+       xor     y, z, TMP1;                             \
+       and     x, TMP1, TMP2;                          \
+       sethi   %hi(const), TMP1;                       \
+       xor     z, TMP2, TMP3;                          \
+       or      TMP1, %lo(const), TMP1;                 \
+       add     wsrc, TMP3, wdst;                       \
+       add     TMP1, TMP4, TMP1;                       \
+       add     wdst, TMP1, wdst;                       \
+       sll     wdst, shift, TMP2;                      \
+       srl     wdst, (32 - shift), TMP3;               \
+       or      TMP2, TMP3, TMP2;                       \
+       add     x, TMP2, wdst;
+
+#define F1(w, x, y, z, index, const, shift)            \
+       LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2);      \
+       xor     y, z, TMP1;                             \
+       and     x, TMP1, TMP2;                          \
+       sethi   %hi(const), TMP1;                       \
+       xor     z, TMP2, TMP3;                          \
+       or      TMP1, %lo(const), TMP1;                 \
+       add     w, TMP3, w;                             \
+       add     TMP1, TMP4, TMP1;                       \
+       add     w, TMP1, w;                             \
+       sll     w, shift, TMP2;                         \
+       srl     w, (32 - shift), TMP3;                  \
+       or      TMP2, TMP3, TMP2;                       \
+       add     x, TMP2, w;
+
+#define F2(w, x, y, z, index, const, shift)            \
+       LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2);      \
+       xor     x, y, TMP1;                             \
+       and     z, TMP1, TMP2;                          \
+       sethi   %hi(const), TMP1;                       \
+       xor     y, TMP2, TMP3;                          \
+       or      TMP1, %lo(const), TMP1;                 \
+       add     w, TMP3, w;                             \
+       add     TMP1, TMP4, TMP1;                       \
+       add     w, TMP1, w;                             \
+       sll     w, shift, TMP2;                         \
+       srl     w, (32 - shift), TMP3;                  \
+       or      TMP2, TMP3, TMP2;                       \
+       add     x, TMP2, w;
+
+#define F3(w, x, y, z, index, const, shift)            \
+       LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2);      \
+       xor     x, y, TMP1;                             \
+       sethi   %hi(const), TMP2;                       \
+       xor     z, TMP1, TMP3;                          \
+       or      TMP2, %lo(const), TMP2;                 \
+       add     w, TMP3, w;                             \
+       add     TMP2, TMP4, TMP1;                       \
+       add     w, TMP1, w;                             \
+       sll     w, shift, TMP2;                         \
+       srl     w, (32 - shift), TMP3;                  \
+       or      TMP2, TMP3, TMP2;                       \
+       add     x, TMP2, w;
+
+#define F4(w, x, y, z, index, const, shift)            \
+       LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2);      \
+       orn     x, z, TMP1;                             \
+       sethi   %hi(const), TMP2;                       \
+       xor     y, TMP1, TMP3;                          \
+       or      TMP2, %lo(const), TMP2;                 \
+       add     w, TMP3, w;                             \
+       add     TMP2, TMP4, TMP1;                       \
+       add     w, TMP1, w;                             \
+       sll     w, shift, TMP2;                         \
+       srl     w, (32 - shift), TMP3;                  \
+       or      TMP2, TMP3, TMP2;                       \
+       add     x, TMP2, w;
+
+#define MD5_TRANSFORM \
+       F1_I(H0, A, H1, H2, H3,  0, 0xd76aa478,  7) \
+       F1_I(H3, D,  A, H1, H2,  1, 0xe8c7b756, 12) \
+       F1_I(H2, C,  D,  A, H1,  2, 0x242070db, 17) \
+       F1_I(H1, B,  C,  D,  A,  3, 0xc1bdceee, 22) \
+       F1(A, B, C, D,  4, 0xf57c0faf,  7) \
+       F1(D, A, B, C,  5, 0x4787c62a, 12) \
+       F1(C, D, A, B,  6, 0xa8304613, 17) \
+       F1(B, C, D, A,  7, 0xfd469501, 22) \
+       F1(A, B, C, D,  8, 0x698098d8,  7) \
+       F1(D, A, B, C,  9, 0x8b44f7af, 12) \
+       F1(C, D, A, B, 10, 0xffff5bb1, 17) \
+       F1(B, C, D, A, 11, 0x895cd7be, 22) \
+       F1(A, B, C, D, 12, 0x6b901122,  7) \
+       F1(D, A, B, C, 13, 0xfd987193, 12) \
+       F1(C, D, A, B, 14, 0xa679438e, 17) \
+       F1(B, C, D, A, 15, 0x49b40821, 22) \
+       F2(A, B, C, D,  1, 0xf61e2562,  5) \
+       F2(D, A, B, C,  6, 0xc040b340,  9) \
+       F2(C, D, A, B, 11, 0x265e5a51, 14) \
+       F2(B, C, D, A,  0, 0xe9b6c7aa, 20) \
+       F2(A, B, C, D,  5, 0xd62f105d,  5) \
+       F2(D, A, B, C, 10, 0x02441453,  9) \
+       F2(C, D, A, B, 15, 0xd8a1e681, 14) \
+       F2(B, C, D, A,  4, 0xe7d3fbc8, 20) \
+       F2(A, B, C, D,  9, 0x21e1cde6,  5) \
+       F2(D, A, B, C, 14, 0xc33707d6,  9) \
+       F2(C, D, A, B,  3, 0xf4d50d87, 14) \
+       F2(B, C, D, A,  8, 0x455a14ed, 20) \
+       F2(A, B, C, D, 13, 0xa9e3e905,  5) \
+       F2(D, A, B, C,  2, 0xfcefa3f8,  9) \
+       F2(C, D, A, B,  7, 0x676f02d9, 14) \
+       F2(B, C, D, A, 12, 0x8d2a4c8a, 20) \
+       F3(A, B, C, D,  5, 0xfffa3942,  4) \
+       F3(D, A, B, C,  8, 0x8771f681, 11) \
+       F3(C, D, A, B, 11, 0x6d9d6122, 16) \
+       F3(B, C, D, A, 14, 0xfde5380c, 23) \
+       F3(A, B, C, D,  1, 0xa4beea44,  4) \
+       F3(D, A, B, C,  4, 0x4bdecfa9, 11) \
+       F3(C, D, A, B,  7, 0xf6bb4b60, 16) \
+       F3(B, C, D, A, 10, 0xbebfbc70, 23) \
+       F3(A, B, C, D, 13, 0x289b7ec6,  4) \
+       F3(D, A, B, C,  0, 0xeaa127fa, 11) \
+       F3(C, D, A, B,  3, 0xd4ef3085, 16) \
+       F3(B, C, D, A,  6, 0x04881d05, 23) \
+       F3(A, B, C, D,  9, 0xd9d4d039,  4) \
+       F3(D, A, B, C, 12, 0xe6db99e5, 11) \
+       F3(C, D, A, B, 15, 0x1fa27cf8, 16) \
+       F3(B, C, D, A,  2, 0xc4ac5665, 23) \
+       F4(A, B, C, D,  0, 0xf4292244,  6) \
+       F4(D, A, B, C,  7, 0x432aff97, 10) \
+       F4(C, D, A, B, 14, 0xab9423a7, 15) \
+       F4(B, C, D, A,  5, 0xfc93a039, 21) \
+       F4(A, B, C, D, 12, 0x655b59c3,  6) \
+       F4(D, A, B, C,  3, 0x8f0ccc92, 10) \
+       F4(C, D, A, B, 10, 0xffeff47d, 15) \
+       F4(B, C, D, A,  1, 0x85845dd1, 21) \
+       F4(A, B, C, D,  8, 0x6fa87e4f,  6) \
+       F4(D, A, B, C, 15, 0xfe2ce6e0, 10) \
+       F4(C, D, A, B,  6, 0xa3014314, 15) \
+       F4(B, C, D, A, 13, 0x4e0811a1, 21) \
+       F4(A, B, C, D,  4, 0xf7537e82,  6) \
+       F4(D, A, B, C, 11, 0xbd3af235, 10) \
+       F4(C, D, A, B,  2, 0x2ad7d2bb, 15) \
+       F4(B, C, D, A,  9, 0xeb86d391, 21)
+
+#ifdef __arch64__
+       .register       %g2,#scratch
+       .register       %g3,#scratch
+#define FRAME_SIZE     176
+#else
+#define FRAME_SIZE     96
+#endif
+
+       .section        ".text",#alloc,#execinstr
+
+       .align  32
+       .globl  md5_block_asm_data_order
+       .type   md5_block_asm_data_order,#function
+md5_block_asm_data_order:
+       /* %i0=ctx (four 32-bit hash words), %i1=data, %i2=num (64-byte blocks) */
+       save    %sp, -FRAME_SIZE, %sp   /* per-ABI frame: 176 on __arch64__, else 96 */
+       rd      %asi, OASI              /* remember caller %asi; restored at .Lfinish */
+       wr      %g0, 0x88, %asi         ! ASI_PL
+
+       ld      [%i0 + 0x00], H0
+       ld      [%i0 + 0x04], H1
+       andcc   %i1, 0x3, %g0           /* low two bits set => data not 4-byte aligned */
+       ld      [%i0 + 0x08], H2
+       bne,pn  %icc, .Lunaligned
+        ld     [%i0 + 0x0c], H3        /* delay slot: runs whether or not we branch */
+       /* Aligned path: one 32-bit load through %asi per message word. */
+#define LOAD(ADDR, DEST, TMP1, TMP2)   \
+       lda     [ADDR] %asi, DEST;
+
+1:
+       MD5_TRANSFORM
+
+       add     A, H0, H0               /* fold this block's result into the hash */
+       add     B, H1, H1
+       add     C, H2, H2
+       add     D, H3, H3
+
+       subcc   %i2, 1, %i2             /* one block done; loop while num != 0 */
+       bne,pt  %icc, 1b
+        add    %i1, 16 * 4, %i1        /* delay slot: advance data by 64 bytes */
+
+.Lfinish:
+       st      H0, [%i0 + 0x00]        /* write the updated hash back to ctx */
+       st      H1, [%i0 + 0x04]
+       st      H2, [%i0 + 0x08]
+       st      H3, [%i0 + 0x0c]
+
+       wr      OASI, 0x0, %asi         /* put back the %asi value saved on entry */
+
+       ret
+        restore
+       /* Unaligned path: build each little-endian word from four byte loads. */
+.Lunaligned:
+#undef LOAD
+#define LOAD(ADDR, DEST, TMP1, TMP2)   \
+       ldub    [ADDR + 0], DEST;       \
+       ldub    [ADDR + 1], TMP1;       \
+       ldub    [ADDR + 2], TMP2;       \
+       sll     TMP1,  8, TMP1;         \
+       sll     TMP2, 16, TMP2;         \
+       or      DEST, TMP1, DEST;       \
+       ldub    [ADDR + 3], TMP1;       \
+       or      DEST, TMP2, DEST;       \
+       sll     TMP1, 24, TMP1;         \
+       or      DEST, TMP1, DEST;
+
+1:
+       MD5_TRANSFORM
+
+       add     A, H0, H0               /* fold this block's result into the hash */
+       add     B, H1, H1
+       add     C, H2, H2
+       add     D, H3, H3
+
+       subcc   %i2, 1, %i2             /* one block done; loop while num != 0 */
+       bne,pt  %icc, 1b
+        add    %i1, 16 * 4, %i1        /* delay slot: advance data by 64 bytes */
+
+       ba,a,pt %xcc, .Lfinish          /* share the store/restore epilogue */
+       .size   md5_block_asm_data_order,.-md5_block_asm_data_order
diff --git a/crypto/md5/md5_locl.h b/crypto/md5/md5_locl.h
index 968d577..f39832e 100644
--- a/crypto/md5/md5_locl.h
+++ b/crypto/md5/md5_locl.h
@@ -71,6 +71,8 @@
 #  define md5_block_data_order md5_block_asm_data_order
 # elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
 #  define md5_block_data_order md5_block_asm_data_order
+# elif defined(__sparc__)
+#  define md5_block_data_order md5_block_asm_data_order
 # endif
 #endif
 
-- 
1.7.10.4

______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                       openssl-dev@openssl.org
Automated List Manager                           majord...@openssl.org

Reply via email to