The techniques used in this plain v9 implementation are: 1) Use little-endian 32-bit loads when input data is aligned.
2) Avoid having to accumulate into the context hash values every loop iteration. 3) In the aligned case try to separate the loads from the first use by as many instructions as possible, without sacrificing the schedule too much. 4) Attempt to dual-issue as much as possible on UltraSPARC-I/II/III/IV and SPARC-T4. The following measurements of "openssl speed md5" were taken on a SPARC-T4. Baseline: type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes md5 11952.12k 40947.20k 113453.25k 204308.82k 266010.62k With md5-sparcv9.S assembler: type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes md5 13062.19k 43100.63k 129569.37k 260317.18k 366650.51k Signed-off-by: David S. Miller <da...@davemloft.net> --- Configure | 2 +- crypto/md5/Makefile | 3 + crypto/md5/asm/md5-sparcv9.S | 242 ++++++++++++++++++++++++++++++++++++++++++ crypto/md5/md5_locl.h | 2 + 4 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 crypto/md5/asm/md5-sparcv9.S diff --git a/Configure b/Configure index f6c271f..2333a63 100755 --- a/Configure +++ b/Configure @@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; +my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o 
aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; diff --git a/crypto/md5/Makefile b/crypto/md5/Makefile index 7d42da4..075b8dd 100644 --- a/crypto/md5/Makefile +++ b/crypto/md5/Makefile @@ -52,6 +52,9 @@ md5-ia64.s: asm/md5-ia64.S $(CC) $(CFLAGS) -E asm/md5-ia64.S | \ $(PERL) -ne 's/;\s+/;\n/g; print;' > $@ +md5-sparcv9.s: asm/md5-sparcv9.S + $(CC) $(CFLAGS) -E asm/md5-sparcv9.S > $@ + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/md5/asm/md5-sparcv9.S b/crypto/md5/asm/md5-sparcv9.S new file mode 100644 index 0000000..30a2fdf --- /dev/null +++ b/crypto/md5/asm/md5-sparcv9.S @@ -0,0 +1,242 @@ +/* Written by David S. Miller <da...@davemloft.net> for the OpenSSL + * project. The module is, however, dual licensed under OpenSSL and + * CRYPTOGAMS licenses depending on where you obtain it. For further + * details see http://www.openssl.org/~appro/cryptogams/. 
+ */ + +#define OASI %i3 + +#define H0 %o0 +#define H1 %o1 +#define H2 %o2 +#define H3 %o3 + +#define A %l0 +#define B %l1 +#define C %l2 +#define D %l3 + +#define TMP1 %l4 +#define TMP2 %l5 +#define TMP3 %l6 +#define TMP4 %l7 + +#define F1_I(wsrc, wdst, x, y, z, index, const, shift) \ + LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2); \ + xor y, z, TMP1; \ + and x, TMP1, TMP2; \ + sethi %hi(const), TMP1; \ + xor z, TMP2, TMP3; \ + or TMP1, %lo(const), TMP1; \ + add wsrc, TMP3, wdst; \ + add TMP1, TMP4, TMP1; \ + add wdst, TMP1, wdst; \ + sll wdst, shift, TMP2; \ + srl wdst, (32 - shift), TMP3; \ + or TMP2, TMP3, TMP2; \ + add x, TMP2, wdst; + +#define F1(w, x, y, z, index, const, shift) \ + LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2); \ + xor y, z, TMP1; \ + and x, TMP1, TMP2; \ + sethi %hi(const), TMP1; \ + xor z, TMP2, TMP3; \ + or TMP1, %lo(const), TMP1; \ + add w, TMP3, w; \ + add TMP1, TMP4, TMP1; \ + add w, TMP1, w; \ + sll w, shift, TMP2; \ + srl w, (32 - shift), TMP3; \ + or TMP2, TMP3, TMP2; \ + add x, TMP2, w; + +#define F2(w, x, y, z, index, const, shift) \ + LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2); \ + xor x, y, TMP1; \ + and z, TMP1, TMP2; \ + sethi %hi(const), TMP1; \ + xor y, TMP2, TMP3; \ + or TMP1, %lo(const), TMP1; \ + add w, TMP3, w; \ + add TMP1, TMP4, TMP1; \ + add w, TMP1, w; \ + sll w, shift, TMP2; \ + srl w, (32 - shift), TMP3; \ + or TMP2, TMP3, TMP2; \ + add x, TMP2, w; + +#define F3(w, x, y, z, index, const, shift) \ + LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2); \ + xor x, y, TMP1; \ + sethi %hi(const), TMP2; \ + xor z, TMP1, TMP3; \ + or TMP2, %lo(const), TMP2; \ + add w, TMP3, w; \ + add TMP2, TMP4, TMP1; \ + add w, TMP1, w; \ + sll w, shift, TMP2; \ + srl w, (32 - shift), TMP3; \ + or TMP2, TMP3, TMP2; \ + add x, TMP2, w; + +#define F4(w, x, y, z, index, const, shift) \ + LOAD(%i1 + (index * 4), TMP4, TMP3, TMP2); \ + orn x, z, TMP1; \ + sethi %hi(const), TMP2; \ + xor y, TMP1, TMP3; \ + or TMP2, %lo(const), TMP2; \ + add w, TMP3, w; \ + add 
TMP2, TMP4, TMP1; \ + add w, TMP1, w; \ + sll w, shift, TMP2; \ + srl w, (32 - shift), TMP3; \ + or TMP2, TMP3, TMP2; \ + add x, TMP2, w; + +#define MD5_TRANSFORM \ + F1_I(H0, A, H1, H2, H3, 0, 0xd76aa478, 7) \ + F1_I(H3, D, A, H1, H2, 1, 0xe8c7b756, 12) \ + F1_I(H2, C, D, A, H1, 2, 0x242070db, 17) \ + F1_I(H1, B, C, D, A, 3, 0xc1bdceee, 22) \ + F1(A, B, C, D, 4, 0xf57c0faf, 7) \ + F1(D, A, B, C, 5, 0x4787c62a, 12) \ + F1(C, D, A, B, 6, 0xa8304613, 17) \ + F1(B, C, D, A, 7, 0xfd469501, 22) \ + F1(A, B, C, D, 8, 0x698098d8, 7) \ + F1(D, A, B, C, 9, 0x8b44f7af, 12) \ + F1(C, D, A, B, 10, 0xffff5bb1, 17) \ + F1(B, C, D, A, 11, 0x895cd7be, 22) \ + F1(A, B, C, D, 12, 0x6b901122, 7) \ + F1(D, A, B, C, 13, 0xfd987193, 12) \ + F1(C, D, A, B, 14, 0xa679438e, 17) \ + F1(B, C, D, A, 15, 0x49b40821, 22) \ + F2(A, B, C, D, 1, 0xf61e2562, 5) \ + F2(D, A, B, C, 6, 0xc040b340, 9) \ + F2(C, D, A, B, 11, 0x265e5a51, 14) \ + F2(B, C, D, A, 0, 0xe9b6c7aa, 20) \ + F2(A, B, C, D, 5, 0xd62f105d, 5) \ + F2(D, A, B, C, 10, 0x02441453, 9) \ + F2(C, D, A, B, 15, 0xd8a1e681, 14) \ + F2(B, C, D, A, 4, 0xe7d3fbc8, 20) \ + F2(A, B, C, D, 9, 0x21e1cde6, 5) \ + F2(D, A, B, C, 14, 0xc33707d6, 9) \ + F2(C, D, A, B, 3, 0xf4d50d87, 14) \ + F2(B, C, D, A, 8, 0x455a14ed, 20) \ + F2(A, B, C, D, 13, 0xa9e3e905, 5) \ + F2(D, A, B, C, 2, 0xfcefa3f8, 9) \ + F2(C, D, A, B, 7, 0x676f02d9, 14) \ + F2(B, C, D, A, 12, 0x8d2a4c8a, 20) \ + F3(A, B, C, D, 5, 0xfffa3942, 4) \ + F3(D, A, B, C, 8, 0x8771f681, 11) \ + F3(C, D, A, B, 11, 0x6d9d6122, 16) \ + F3(B, C, D, A, 14, 0xfde5380c, 23) \ + F3(A, B, C, D, 1, 0xa4beea44, 4) \ + F3(D, A, B, C, 4, 0x4bdecfa9, 11) \ + F3(C, D, A, B, 7, 0xf6bb4b60, 16) \ + F3(B, C, D, A, 10, 0xbebfbc70, 23) \ + F3(A, B, C, D, 13, 0x289b7ec6, 4) \ + F3(D, A, B, C, 0, 0xeaa127fa, 11) \ + F3(C, D, A, B, 3, 0xd4ef3085, 16) \ + F3(B, C, D, A, 6, 0x04881d05, 23) \ + F3(A, B, C, D, 9, 0xd9d4d039, 4) \ + F3(D, A, B, C, 12, 0xe6db99e5, 11) \ + F3(C, D, A, B, 15, 0x1fa27cf8, 16) \ + F3(B, C, D, A, 
2, 0xc4ac5665, 23) \ + F4(A, B, C, D, 0, 0xf4292244, 6) \ + F4(D, A, B, C, 7, 0x432aff97, 10) \ + F4(C, D, A, B, 14, 0xab9423a7, 15) \ + F4(B, C, D, A, 5, 0xfc93a039, 21) \ + F4(A, B, C, D, 12, 0x655b59c3, 6) \ + F4(D, A, B, C, 3, 0x8f0ccc92, 10) \ + F4(C, D, A, B, 10, 0xffeff47d, 15) \ + F4(B, C, D, A, 1, 0x85845dd1, 21) \ + F4(A, B, C, D, 8, 0x6fa87e4f, 6) \ + F4(D, A, B, C, 15, 0xfe2ce6e0, 10) \ + F4(C, D, A, B, 6, 0xa3014314, 15) \ + F4(B, C, D, A, 13, 0x4e0811a1, 21) \ + F4(A, B, C, D, 4, 0xf7537e82, 6) \ + F4(D, A, B, C, 11, 0xbd3af235, 10) \ + F4(C, D, A, B, 2, 0x2ad7d2bb, 15) \ + F4(B, C, D, A, 9, 0xeb86d391, 21) + +#ifdef __arch64__ + .register %g2,#scratch + .register %g3,#scratch +#define FRAME_SIZE 176 +#else +#define FRAME_SIZE 96 +#endif + + .section ".text",#alloc,#execinstr + + .align 32 + .globl md5_block_asm_data_order + .type md5_block_asm_data_order,#function +md5_block_asm_data_order: + /* %i0=ctx, %i1=data, %i2=num */ + save %sp, -96, %sp + rd %asi, OASI + wr %g0, 0x88, %asi ! 
ASI_PL + + ld [%i0 + 0x00], H0 + ld [%i0 + 0x04], H1 + andcc %i1, 0x3, %g0 + ld [%i0 + 0x08], H2 + bne,pn %icc, .Lunaligned + ld [%i0 + 0x0c], H3 + +#define LOAD(ADDR, DEST, TMP1, TMP2) \ + lda [ADDR] %asi, DEST; + +1: + MD5_TRANSFORM + + add A, H0, H0 + add B, H1, H1 + add C, H2, H2 + add D, H3, H3 + + subcc %i2, 1, %i2 + bne,pt %icc, 1b + add %i1, 16 * 4, %i1 + +.Lfinish: + st H0, [%i0 + 0x00] + st H1, [%i0 + 0x04] + st H2, [%i0 + 0x08] + st H3, [%i0 + 0x0c] + + wr OASI, 0x0, %asi + + ret + restore + +.Lunaligned: +#undef LOAD +#define LOAD(ADDR, DEST, TMP1, TMP2) \ + ldub [ADDR + 0], DEST; \ + ldub [ADDR + 1], TMP1; \ + ldub [ADDR + 2], TMP2; \ + sll TMP1, 8, TMP1; \ + sll TMP2, 16, TMP2; \ + or DEST, TMP1, DEST; \ + ldub [ADDR + 3], TMP1; \ + or DEST, TMP2, DEST; \ + sll TMP1, 24, TMP1; \ + or DEST, TMP1, DEST; + +1: + MD5_TRANSFORM + + add A, H0, H0 + add B, H1, H1 + add C, H2, H2 + add D, H3, H3 + + subcc %i2, 1, %i2 + bne,pt %icc, 1b + add %i1, 16 * 4, %i1 + + ba,a,pt %xcc, .Lfinish + .size md5_block_asm_data_order,.-md5_block_asm_data_order diff --git a/crypto/md5/md5_locl.h b/crypto/md5/md5_locl.h index 968d577..f39832e 100644 --- a/crypto/md5/md5_locl.h +++ b/crypto/md5/md5_locl.h @@ -71,6 +71,8 @@ # define md5_block_data_order md5_block_asm_data_order # elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) # define md5_block_data_order md5_block_asm_data_order +# elif defined(__sparc__) +# define md5_block_data_order md5_block_asm_data_order # endif #endif -- 1.7.10.4 ______________________________________________________________________ OpenSSL Project http://www.openssl.org Development Mailing List openssl-dev@openssl.org Automated List Manager majord...@openssl.org