Hi! Time for more LP64 woes:-)
Yup, it's MD5's turn now. I've found the same (unacceptable from my
point of view) sizeof(int/long)/*_ENDIAN dependencies on LP64 platforms
as in SHA and Blowfish. Just like in the SHA case it was operational on
Alpha only because L_ENDIAN wasn't defined. But it was operational under
Solaris7/64 with B_ENDIAN on the compiler command line. How come?
Here is the explanation:
from crypto/md5/md5_locl.h:
/* On sparc, this actually slows things down :-( */
#if defined(sun)
#undef B_ENDIAN
#endif
As far as I can tell the comment was introduced between SSLeay-0.8.1b
and SSLeay-0.9.0b... Again, it's nothing but sheer luck! Sigh...
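To make the LP64 half of the complaint concrete, here is a tiny
stand-alone sketch (my own illustration, not code from the tree) of why
the classic 32-bit rotate misbehaves once unsigned long is 64 bits wide
unless the result is masked back down:

/* Sketch: on ILP32 the left shift truncates for free; on LP64 the
 * high 32 bits collect garbage, so the result has to be masked to
 * make both platforms agree. Compile on both and compare. */
#include <stdio.h>

#define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))

int main(void)
	{
	unsigned long a=0x80000001UL;
	a=ROTATE(a,1)&0xffffffffUL;	/* prints 00000003 on *both* */
	printf("%08lx\n",a);
	return 0;
	}

(The ROTATE in md32_common.h masks before the right shift for exactly
this reason, and the high-bit garbage the left shift leaves behind is
harmless because only the 32 LSBs ever get saved.)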
Speaking of releases, by the way: what's the rush with them?
In either case, find the jumbo patch attached. I've tested it on SPARC
Solaris, Intel Linux and Alpha Linux. The code was extensively
reorganized (a new crypto/md32_common.h with slightly incomprehensible
#ifdef spaghetti was introduced) and slightly redesigned to eliminate
unnecessary copying cycles. Here's a small matrix showing the
performance improvement over the current implementation.
+-----------------------+-----------------------+-------------------+
|arch                   |input is 32-bit aligned|input isn't aligned|
|-----------------------|-----------------------|-------------------|
|SPARC v8plus SC5.0     |+20%                   |+4%                |
|SPARC v8plus egcs-1.1.2|+28%                   |+10%               |
|SPARC v9 SC5.0         |+37% (wow!)            |+24% (wow!)        |
|Alpha Linux egcs-1.0.2 |+44% (wow!)            |+20% (wow!)        |
|Intel Linux egcs-1.1.2 |+2%                    |+14%               |
+-----------------------+-----------------------+-------------------+
Don't get too impressed by the improvement in the aligned case for SPARC
v8plus. It's the UltraSPARC-specific assembler module exploiting the
"load in little-endian order" instruction that is responsible for this
(well, feel free to get impressed by the fact that I actually hand-coded
it:-). Do get impressed by the 24% in the unaligned SPARC v9 case. Well,
there is no black magic there; this tune-up simply brought the
performance back up to the v8plus level, so to speak... Do get impressed
by the Alpha cases. At least I was impressed! Do note though that I fed
-DL_ENDIAN to the Alpha compiler in order to get the above result. I.e.
I did something the current Configure does *not* do! There are some
comments provided explaining what I did and why it's better. I might
want to add some extra ones afterwards...
All of the above brings us to an interesting question. What do we know
about the alignment of the data the SSL layer feeds to the cipher and
hash routines?
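To make the question concrete: the new update path boils down to a
dispatch of this shape (a simplified sketch of what HASH_UPDATE in
md32_common.h does, not the literal code; the helper name is mine and
the declarations are assumed to come from the patched md5_locl.h):

static void md5_update_dispatch(MD5_CTX *c, const unsigned char *data,
		int sw)
	{
	/* Take the fast path only when the caller happens to hand us
	 * 32-bit aligned input; otherwise fall back to the routine
	 * that assembles words a byte at a time in data byte order. */
	if ((((unsigned long)data)%4) == 0)
		md5_block_asm_data_order_aligned(c,(const MD5_LONG *)data,sw);
	else
		md5_block_data_order(c,data,sw);
	}

If the SSL layer mostly hands over unaligned buffers, the aligned fast
path buys us next to nothing, hence the question.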
And the comment about Configure brings us to another question. I assume
everybody would agree that it would be nice to get rid of the
LP64/*_ENDIAN dependencies and make the *_ENDIAN flag autoconfigurable.
Does the development team have the guts to do it now? :-) Another
candidate for autoconfiguration is the choice between the proposed
__LP32__, __ILP32__, __LP64__ and __ILP64__ flags.
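For what it's worth, autoconfiguring both would need nothing more than
a trivial probe program run at Configure time. A sketch (assuming the
build host is the target, which cross-compilation would of course
break):

#include <stdio.h>

int main(void)
	{
	union { long l; char c[sizeof(long)]; } u;
	u.l=1;	/* the first byte tells the byte order */
	printf("%s\n",u.c[0]?"-DL_ENDIAN":"-DB_ENDIAN");
	printf("sizeof(int)=%d sizeof(long)=%d\n",
		(int)sizeof(int),(int)sizeof(long));
	return 0;
	}

The second printf is enough to pick between the __ILP32__ and __LP64__
flavours as well.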
What's next? I mean for my part. I'm going to reorganize and (re-)tune
SHA. After that I'm going to examine RC4, DES and RC2, as these are
extensively used by the Web browsers. Somewhere in the middle I'll
probably be able to squeeze in the IRIX bignum work, i.e. bug hunting
and a mips3.s rewrite (which will probably be the most profitable, as
one can address the 128-bit multiplication and division instructions:-).
Cheers. Andy.
*** ./Configure.orig Fri May 7 18:00:05 1999
--- ./Configure Sun May 9 17:04:40 1999
***************
*** 106,112 ****
# Solaris setups
"solaris-x86-gcc","gcc:-O3 -fomit-frame-pointer -m486 -Wall
-DL_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG $x86_gcc_des
$x86_gcc_opts:$x86_sol_asm",
"solaris-sparc-gcc","gcc:-O3 -fomit-frame-pointer -mv8 -Wall
-DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL
BF_PTR:asm/sparcv8.o::",
! "solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall
-DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL
BF_PTR:asm/sparcv8plus-gcc.o::",
"debug-solaris-sparc-gcc","gcc:-O3 -g -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket
-lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:::",
# DO NOT use /xO[34] on sparc with SC3.0. It is broken, and will not pass the tests
--- 106,112 ----
# Solaris setups
"solaris-x86-gcc","gcc:-O3 -fomit-frame-pointer -m486 -Wall
-DL_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG $x86_gcc_des
$x86_gcc_opts:$x86_sol_asm",
"solaris-sparc-gcc","gcc:-O3 -fomit-frame-pointer -mv8 -Wall
-DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL
BF_PTR:asm/sparcv8.o::",
! "solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall -DB_ENDIAN
-DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL
BF_PTR:asm/sparcv8plus-gcc.o:::asm/md5-sparcv8plus.o:",
"debug-solaris-sparc-gcc","gcc:-O3 -g -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket
-lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:::",
# DO NOT use /xO[34] on sparc with SC3.0. It is broken, and will not pass the tests
***************
*** 114,125 ****
# SC4 is ok, better than gcc even on bn as long as you tell it -xarch=v8
# -fast slows things like DES down quite a lot
# Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
- # SC5.0 with the compiler common patch works.
"solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN
-DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL
BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN
-DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL
BF_PTR:asm/sparcv8plus.o::",
# SC5.0 note: Compiler common patch 107357-01 or later is required!
! "solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa
-DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1
DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
! "solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa
-DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR
DES_RISC1 DES_UNROLL BF_PTR:::",
# Sunos configs, assuming sparc for the gcc one.
##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",
--- 114,124 ----
# SC4 is ok, better than gcc even on bn as long as you tell it -xarch=v8
# -fast slows things like DES down quite a lot
# Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
"solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN
-DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL
BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN
-DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL
BF_PTR:asm/sparcv8plus.o::",
# SC5.0 note: Compiler common patch 107357-01 or later is required!
! "solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa
-DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR
DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:",
! "solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa
-DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR
DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:",
# Sunos configs, assuming sparc for the gcc one.
##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",
*** ./crypto/md5/md5_dgst.c.orig Mon Apr 26 19:00:25 1999
--- ./crypto/md5/md5_dgst.c Sun May 9 16:42:49 1999
***************
*** 70,81 ****
#define INIT_DATA_C (unsigned long)0x98badcfeL
#define INIT_DATA_D (unsigned long)0x10325476L
- # ifdef MD5_ASM
- void md5_block_x86(MD5_CTX *c, unsigned long *p,int num);
- # define md5_block md5_block_x86
- # else
- static void md5_block(MD5_CTX *c, unsigned long *p,int num);
- # endif
void MD5_Init(MD5_CTX *c)
{
c->A=INIT_DATA_A;
--- 70,75 ----
***************
*** 87,269 ****
c->num=0;
}
! void MD5_Update(MD5_CTX *c, const void *_data, unsigned long len)
{
! register const unsigned char *data=_data;
! register ULONG *p;
! int sw,sc;
! ULONG l;
- if (len == 0) return;
-
- l=(c->Nl+(len<<3))&0xffffffffL;
- /* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
- * Wei Dai <[EMAIL PROTECTED]> for pointing it out. */
- if (l < c->Nl) /* overflow */
- c->Nh++;
- c->Nh+=(len>>29);
- c->Nl=l;
-
- if (c->num != 0)
- {
- p=c->data;
- sw=c->num>>2;
- sc=c->num&0x03;
-
- if ((c->num+len) >= MD5_CBLOCK)
- {
- l= p[sw];
- p_c2l(data,l,sc);
- p[sw++]=l;
- for (; sw<MD5_LBLOCK; sw++)
- {
- c2l(data,l);
- p[sw]=l;
- }
- len-=(MD5_CBLOCK-c->num);
-
- md5_block(c,p,64);
- c->num=0;
- /* drop through and do the rest */
- }
- else
- {
- int ew,ec;
-
- c->num+=(int)len;
- if ((sc+len) < 4) /* ugly, add char's to a word */
- {
- l= p[sw];
- p_c2l_p(data,l,sc,len);
- p[sw]=l;
- }
- else
- {
- ew=(c->num>>2);
- ec=(c->num&0x03);
- l= p[sw];
- p_c2l(data,l,sc);
- p[sw++]=l;
- for (; sw < ew; sw++)
- { c2l(data,l); p[sw]=l; }
- if (ec)
- {
- c2l_p(data,l,ec);
- p[sw]=l;
- }
- }
- return;
- }
- }
- /* we now can process the input data in blocks of MD5_CBLOCK
- * chars and save the leftovers to c->data. */
- #ifdef L_ENDIAN
- if ((((unsigned long)data)%sizeof(ULONG)) == 0)
- {
- sw=(int)len/MD5_CBLOCK;
- if (sw > 0)
- {
- sw*=MD5_CBLOCK;
- md5_block(c,(ULONG *)data,sw);
- data+=sw;
- len-=sw;
- }
- }
- #endif
- p=c->data;
- while (len >= MD5_CBLOCK)
- {
- #if defined(L_ENDIAN) || defined(B_ENDIAN)
- if (p != (unsigned long *)data)
- memcpy(p,data,MD5_CBLOCK);
- data+=MD5_CBLOCK;
- #ifdef B_ENDIAN
- for (sw=(MD5_LBLOCK/4); sw; sw--)
- {
- Endian_Reverse32(p[0]);
- Endian_Reverse32(p[1]);
- Endian_Reverse32(p[2]);
- Endian_Reverse32(p[3]);
- p+=4;
- }
- #endif
- #else
- for (sw=(MD5_LBLOCK/4); sw; sw--)
- {
- c2l(data,l); *(p++)=l;
- c2l(data,l); *(p++)=l;
- c2l(data,l); *(p++)=l;
- c2l(data,l); *(p++)=l;
- }
- #endif
- p=c->data;
- md5_block(c,p,64);
- len-=MD5_CBLOCK;
- }
- sc=(int)len;
- c->num=sc;
- if (sc)
- {
- sw=sc>>2; /* words to copy */
- #ifdef L_ENDIAN
- p[sw]=0;
- memcpy(p,data,sc);
- #else
- sc&=0x03;
- for ( ; sw; sw--)
- { c2l(data,l); *(p++)=l; }
- c2l_p(data,l,sc);
- *p=l;
- #endif
- }
- }
-
- void MD5_Transform(MD5_CTX *c, unsigned char *b)
- {
- ULONG p[16];
- #if !defined(L_ENDIAN)
- ULONG *q;
- int i;
- #endif
-
- #if defined(B_ENDIAN) || defined(L_ENDIAN)
- memcpy(p,b,64);
- #ifdef B_ENDIAN
- q=p;
- for (i=(MD5_LBLOCK/4); i; i--)
- {
- Endian_Reverse32(q[0]);
- Endian_Reverse32(q[1]);
- Endian_Reverse32(q[2]);
- Endian_Reverse32(q[3]);
- q+=4;
- }
- #endif
- #else
- q=p;
- for (i=(MD5_LBLOCK/4); i; i--)
- {
- ULONG l;
- c2l(b,l); *(q++)=l;
- c2l(b,l); *(q++)=l;
- c2l(b,l); *(q++)=l;
- c2l(b,l); *(q++)=l;
- }
- #endif
- md5_block(c,p,64);
- }
-
- #ifndef MD5_ASM
-
- static void md5_block(MD5_CTX *c, register ULONG *X, int num)
- {
- register ULONG A,B,C,D;
-
A=c->A;
B=c->B;
C=c->C;
D=c->D;
! for (;;)
{
/* Round 0 */
R0(A,B,C,D,X[ 0], 7,0xd76aa478L);
--- 81,111 ----
c->num=0;
}
! #ifndef md5_block_host_order
! static void md5_block_host_order (MD5_CTX *c, const MD5_LONG *X, int num)
{
! register unsigned long A,B,C,D;
! /*
! * In case you wonder why A-D are declared as long and not
! * as MD5_LONG. Doing so results in a slight performance
! * boost on LP64 architectures. The catch is we don't
! * really care if the 32 MSBs of a 64-bit register get polluted
! * with eventual overflows as we *save* only the 32 LSBs in
! * *either* case. Now declaring 'em long excuses the compiler
! * from keeping the 32 MSBs zeroed, resulting in a 13% performance
! * improvement under SPARC Solaris7/64 and 5% under AlphaLinux.
! * Well, to be honest it should say that this *prevents*
! * performance degradation.
! *
! * <[EMAIL PROTECTED]>
! */
A=c->A;
B=c->B;
C=c->C;
D=c->D;
!
! for (;num--;X+=HASH_LBLOCK)
{
/* Round 0 */
R0(A,B,C,D,X[ 0], 7,0xd76aa478L);
***************
*** 334,407 ****
R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
R3(B,C,D,A,X[ 9],21,0xeb86d391L);
! A+=c->A&0xffffffffL;
! B+=c->B&0xffffffffL;
! c->A=A;
! c->B=B;
! C+=c->C&0xffffffffL;
! D+=c->D&0xffffffffL;
! c->C=C;
! c->D=D;
! X+=16;
! num-=64;
! if (num <= 0) break;
}
}
#endif
! void MD5_Final(unsigned char *md, MD5_CTX *c)
{
! register int i,j;
! register ULONG l;
! register ULONG *p;
! static unsigned char end[4]={0x80,0x00,0x00,0x00};
! unsigned char *cp=end;
! /* c->num should definitly have room for at least one more byte. */
! p=c->data;
! j=c->num;
! i=j>>2;
! /* purify often complains about the following line as an
! * Uninitialized Memory Read. While this can be true, the
! * following p_c2l macro will reset l when that case is true.
! * This is because j&0x03 contains the number of 'valid' bytes
! * already in p[i]. If and only if j&0x03 == 0, the UMR will
! * occur but this is also the only time p_c2l will do
! * l= *(cp++) instead of l|= *(cp++)
! * Many thanks to Alex Tang <[EMAIL PROTECTED]> for pickup this
! * 'potential bug' */
! #ifdef PURIFY
! if ((j&0x03) == 0) p[i]=0;
! #endif
! l=p[i];
! p_c2l(cp,l,j&0x03);
! p[i]=l;
! i++;
! /* i is the next 'undefined word' */
! if (c->num >= MD5_LAST_BLOCK)
{
! for (; i<MD5_LBLOCK; i++)
! p[i]=0;
! md5_block(c,p,64);
! i=0;
! }
! for (; i<(MD5_LBLOCK-2); i++)
! p[i]=0;
! p[MD5_LBLOCK-2]=c->Nl;
! p[MD5_LBLOCK-1]=c->Nh;
! md5_block(c,p,64);
! cp=md;
! l=c->A; l2c(l,cp);
! l=c->B; l2c(l,cp);
! l=c->C; l2c(l,cp);
! l=c->D; l2c(l,cp);
! /* clear stuff, md5_block may be leaving some stuff on the stack
! * but I'm not worried :-) */
! c->num=0;
! /* memset((char *)&c,0,sizeof(c));*/
}
#ifdef undef
int printit(unsigned long *l)
--- 176,302 ----
R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
R3(B,C,D,A,X[ 9],21,0xeb86d391L);
! A = c->A += A;
! B = c->B += B;
! C = c->C += C;
! D = c->D += D;
}
}
#endif
! #ifndef md5_block_data_order
! static void md5_block_data_order (MD5_CTX *c, const unsigned char *data, int num)
{
! register unsigned long A,B,C,D,l;
! /*
! * In case you wonder why A-D are declared as long and not
! * as MD5_LONG. Doing so results in a slight performance
! * boost on LP64 architectures. The catch is we don't
! * really care if the 32 MSBs of a 64-bit register get polluted
! * with eventual overflows as we *save* only the 32 LSBs in
! * *either* case. Now declaring 'em long excuses the compiler
! * from keeping the 32 MSBs zeroed, resulting in a 13% performance
! * improvement under SPARC Solaris7/64 and 5% under AlphaLinux.
! * Well, to be honest it should say that this *prevents*
! * performance degradation.
! *
! * <[EMAIL PROTECTED]>
! */
! MD5_LONG X[MD5_LBLOCK];
! /*
! * In case you wonder why I don't use c->data for this:
! * RISCs usually have a handful of registers, and if X is
! * declared as an automatic array a good optimizing compiler
! * will accommodate at least part of it in the register bank
! * instead of memory.
! *
! * <[EMAIL PROTECTED]>
! */
! A=c->A;
! B=c->B;
! C=c->C;
! D=c->D;
! for (;num--;)
{
! HOST_c2l(data,l); X[ 0]=l; HOST_c2l(data,l); X[ 1]=l;
! /* Round 0 */
! R0(A,B,C,D,X[ 0], 7,0xd76aa478L); HOST_c2l(data,l); X[ 2]=l;
! R0(D,A,B,C,X[ 1],12,0xe8c7b756L); HOST_c2l(data,l); X[ 3]=l;
! R0(C,D,A,B,X[ 2],17,0x242070dbL); HOST_c2l(data,l); X[ 4]=l;
! R0(B,C,D,A,X[ 3],22,0xc1bdceeeL); HOST_c2l(data,l); X[ 5]=l;
! R0(A,B,C,D,X[ 4], 7,0xf57c0fafL); HOST_c2l(data,l); X[ 6]=l;
! R0(D,A,B,C,X[ 5],12,0x4787c62aL); HOST_c2l(data,l); X[ 7]=l;
! R0(C,D,A,B,X[ 6],17,0xa8304613L); HOST_c2l(data,l); X[ 8]=l;
! R0(B,C,D,A,X[ 7],22,0xfd469501L); HOST_c2l(data,l); X[ 9]=l;
! R0(A,B,C,D,X[ 8], 7,0x698098d8L); HOST_c2l(data,l); X[10]=l;
! R0(D,A,B,C,X[ 9],12,0x8b44f7afL); HOST_c2l(data,l); X[11]=l;
! R0(C,D,A,B,X[10],17,0xffff5bb1L); HOST_c2l(data,l); X[12]=l;
! R0(B,C,D,A,X[11],22,0x895cd7beL); HOST_c2l(data,l); X[13]=l;
! R0(A,B,C,D,X[12], 7,0x6b901122L); HOST_c2l(data,l); X[14]=l;
! R0(D,A,B,C,X[13],12,0xfd987193L); HOST_c2l(data,l); X[15]=l;
! R0(C,D,A,B,X[14],17,0xa679438eL);
! R0(B,C,D,A,X[15],22,0x49b40821L);
! /* Round 1 */
! R1(A,B,C,D,X[ 1], 5,0xf61e2562L);
! R1(D,A,B,C,X[ 6], 9,0xc040b340L);
! R1(C,D,A,B,X[11],14,0x265e5a51L);
! R1(B,C,D,A,X[ 0],20,0xe9b6c7aaL);
! R1(A,B,C,D,X[ 5], 5,0xd62f105dL);
! R1(D,A,B,C,X[10], 9,0x02441453L);
! R1(C,D,A,B,X[15],14,0xd8a1e681L);
! R1(B,C,D,A,X[ 4],20,0xe7d3fbc8L);
! R1(A,B,C,D,X[ 9], 5,0x21e1cde6L);
! R1(D,A,B,C,X[14], 9,0xc33707d6L);
! R1(C,D,A,B,X[ 3],14,0xf4d50d87L);
! R1(B,C,D,A,X[ 8],20,0x455a14edL);
! R1(A,B,C,D,X[13], 5,0xa9e3e905L);
! R1(D,A,B,C,X[ 2], 9,0xfcefa3f8L);
! R1(C,D,A,B,X[ 7],14,0x676f02d9L);
! R1(B,C,D,A,X[12],20,0x8d2a4c8aL);
! /* Round 2 */
! R2(A,B,C,D,X[ 5], 4,0xfffa3942L);
! R2(D,A,B,C,X[ 8],11,0x8771f681L);
! R2(C,D,A,B,X[11],16,0x6d9d6122L);
! R2(B,C,D,A,X[14],23,0xfde5380cL);
! R2(A,B,C,D,X[ 1], 4,0xa4beea44L);
! R2(D,A,B,C,X[ 4],11,0x4bdecfa9L);
! R2(C,D,A,B,X[ 7],16,0xf6bb4b60L);
! R2(B,C,D,A,X[10],23,0xbebfbc70L);
! R2(A,B,C,D,X[13], 4,0x289b7ec6L);
! R2(D,A,B,C,X[ 0],11,0xeaa127faL);
! R2(C,D,A,B,X[ 3],16,0xd4ef3085L);
! R2(B,C,D,A,X[ 6],23,0x04881d05L);
! R2(A,B,C,D,X[ 9], 4,0xd9d4d039L);
! R2(D,A,B,C,X[12],11,0xe6db99e5L);
! R2(C,D,A,B,X[15],16,0x1fa27cf8L);
! R2(B,C,D,A,X[ 2],23,0xc4ac5665L);
! /* Round 3 */
! R3(A,B,C,D,X[ 0], 6,0xf4292244L);
! R3(D,A,B,C,X[ 7],10,0x432aff97L);
! R3(C,D,A,B,X[14],15,0xab9423a7L);
! R3(B,C,D,A,X[ 5],21,0xfc93a039L);
! R3(A,B,C,D,X[12], 6,0x655b59c3L);
! R3(D,A,B,C,X[ 3],10,0x8f0ccc92L);
! R3(C,D,A,B,X[10],15,0xffeff47dL);
! R3(B,C,D,A,X[ 1],21,0x85845dd1L);
! R3(A,B,C,D,X[ 8], 6,0x6fa87e4fL);
! R3(D,A,B,C,X[15],10,0xfe2ce6e0L);
! R3(C,D,A,B,X[ 6],15,0xa3014314L);
! R3(B,C,D,A,X[13],21,0x4e0811a1L);
! R3(A,B,C,D,X[ 4], 6,0xf7537e82L);
! R3(D,A,B,C,X[11],10,0xbd3af235L);
! R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
! R3(B,C,D,A,X[ 9],21,0xeb86d391L);
! A = c->A += A;
! B = c->B += B;
! C = c->C += C;
! D = c->D += D;
! }
}
+ #endif
#ifdef undef
int printit(unsigned long *l)
*** ./crypto/md5/md5.h.orig Tue Apr 27 04:00:16 1999
--- ./crypto/md5/md5.h Sun May 9 17:14:29 1999
***************
*** 67,89 ****
#error MD5 is disabled.
#endif
#define MD5_CBLOCK 64
! #define MD5_LBLOCK 16
! #define MD5_BLOCK 16
! #define MD5_LAST_BLOCK 56
! #define MD5_LENGTH_BLOCK 8
#define MD5_DIGEST_LENGTH 16
typedef struct MD5state_st
{
! unsigned long A,B,C,D;
! unsigned long Nl,Nh;
! unsigned long data[MD5_LBLOCK];
int num;
} MD5_CTX;
void MD5_Init(MD5_CTX *c);
! void MD5_Update(MD5_CTX *c, const void *data, unsigned long len);
void MD5_Final(unsigned char *md, MD5_CTX *c);
unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md);
void MD5_Transform(MD5_CTX *c, unsigned char *b);
--- 67,109 ----
#error MD5 is disabled.
#endif
+ /*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * ! MD5_LONG has to be at least 32 bits wide. If it's wider, then !
+ * ! MD5_LONG_LOG2 has to be defined along. !
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+ #if defined(WIN16) || defined(__LP32__)
+ #define MD5_LONG unsigned long
+ #elif defined(_CRAY) || defined(__ILP64__)
+ #define MD5_LONG unsigned long
+ #define MD5_LONG_LOG2 3
+ /*
+ * _CRAY note. I could declare short, but I have no idea what impact
+ * it has on performance on non-T3E machines. I could declare
+ * int, but at least on C90 sizeof(int) can be chosen at compile time.
+ * So I've chosen long...
+ * <[EMAIL PROTECTED]>
+ */
+ #else
+ #define MD5_LONG unsigned int
+ #endif
+
#define MD5_CBLOCK 64
! #define MD5_LBLOCK (MD5_CBLOCK/4)
#define MD5_DIGEST_LENGTH 16
typedef struct MD5state_st
{
! MD5_LONG A,B,C,D;
! MD5_LONG Nl,Nh;
! MD5_LONG data[MD5_LBLOCK];
int num;
} MD5_CTX;
void MD5_Init(MD5_CTX *c);
! void MD5_Update(MD5_CTX *c, const unsigned char *data, unsigned long len);
void MD5_Final(unsigned char *md, MD5_CTX *c);
unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md);
void MD5_Transform(MD5_CTX *c, unsigned char *b);
*** ./crypto/md5/md5_locl.h.orig Mon Apr 26 19:00:25 1999
--- ./crypto/md5/md5_locl.h Sun May 9 17:51:26 1999
***************
*** 56,153 ****
* [including the GNU Public Licence.]
*/
- /* On sparc, this actually slows things down :-( */
- #if defined(sun)
- #undef B_ENDIAN
- #endif
-
#include <stdlib.h>
#include <string.h>
#include <openssl/md5.h>
! #define ULONG unsigned long
! #define UCHAR unsigned char
! #define UINT unsigned int
! #undef c2l
! #define c2l(c,l) (l = ((unsigned long)(*((c)++))) , \
! l|=(((unsigned long)(*((c)++)))<< 8), \
! l|=(((unsigned long)(*((c)++)))<<16), \
! l|=(((unsigned long)(*((c)++)))<<24))
! #undef p_c2l
! #define p_c2l(c,l,n) { \
! switch (n) { \
! case 0: l =((unsigned long)(*((c)++))); \
! case 1: l|=((unsigned long)(*((c)++)))<< 8; \
! case 2: l|=((unsigned long)(*((c)++)))<<16; \
! case 3: l|=((unsigned long)(*((c)++)))<<24; \
! } \
! }
! /* NOTE the pointer is not incremented at the end of this */
! #undef c2l_p
! #define c2l_p(c,l,n) { \
! l=0; \
! (c)+=n; \
! switch (n) { \
! case 3: l =((unsigned long)(*(--(c))))<<16; \
! case 2: l|=((unsigned long)(*(--(c))))<< 8; \
! case 1: l|=((unsigned long)(*(--(c)))) ; \
! } \
! }
! #undef p_c2l_p
! #define p_c2l_p(c,l,sc,len) { \
! switch (sc) \
! { \
! case 0: l =((unsigned long)(*((c)++))); \
! if (--len == 0) break; \
! case 1: l|=((unsigned long)(*((c)++)))<< 8; \
! if (--len == 0) break; \
! case 2: l|=((unsigned long)(*((c)++)))<<16; \
! } \
! }
! #undef l2c
! #define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \
! *((c)++)=(unsigned char)(((l)>> 8)&0xff), \
! *((c)++)=(unsigned char)(((l)>>16)&0xff), \
! *((c)++)=(unsigned char)(((l)>>24)&0xff))
! /* NOTE - c is not incremented as per l2c */
! #undef l2cn
! #define l2cn(l1,l2,c,n) { \
! c+=n; \
! switch (n) { \
! case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \
! case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \
! case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \
! case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \
! case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \
! case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \
! case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \
! case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \
! } \
! }
!
! /* A nice byte order reversal from Wei Dai <[EMAIL PROTECTED]> */
! #if defined(WIN32)
! /* 5 instructions with rotate instruction, else 9 */
! #define Endian_Reverse32(a) \
! { \
! unsigned long l=(a); \
! (a)=((ROTATE(l,8)&0x00FF00FF)|(ROTATE(l,24)&0xFF00FF00)); \
! }
! #else
! /* 6 instructions with rotate instruction, else 8 */
! #define Endian_Reverse32(a) \
! { \
! unsigned long l=(a); \
! l=(((l&0xFF00FF00)>>8L)|((l&0x00FF00FF)<<8L)); \
! (a)=ROTATE(l,16L); \
! }
#endif
/*
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
--- 56,141 ----
* [including the GNU Public Licence.]
*/
#include <stdlib.h>
#include <string.h>
#include <openssl/md5.h>
! #ifndef MD5_LONG_LOG2
! #define MD5_LONG_LOG2 2 /* default to 32 bits */
! #endif
! #ifdef MD5_ASM
! # if defined(__i386) || defined(WIN32)
! # define md5_block_host_order md5_block_asm_host_order
! # elif defined(__sparc) && defined(ULTRASPARC)
! void md5_block_asm_data_order_aligned (MD5_CTX *c, const MD5_LONG *p,int num);
! # define HASH_BLOCK_DATA_ORDER_ALIGNED md5_block_asm_data_order_aligned
! # endif
! #endif
! #ifndef md5_block_host_order
! static
! #endif
! void md5_block_host_order (MD5_CTX *c, const MD5_LONG *p,int num);
! #ifndef md5_block_data_order
! static
! #endif
! void md5_block_data_order (MD5_CTX *c, const unsigned char *p,int num);
! #if defined(__i386)
! /*
! * *_block_host_order is expected to handle aligned data while
! * *_block_data_order - unaligned. As algorithm and host (x86)
! * are in this case of the same "endianness", these two are
! * otherwise indistinguishable. But normally you don't want to
! * call the same function because unaligned access in places
! * where alignment is expected is usually a "Bad Thing". Indeed,
! * on RISCs you get punished with BUS ERROR signal or *severe*
! * performance degradation. Intel CPUs are in turn perfectly
! * capable of loading unaligned data without such drastic side
! * effect. Yes, they say it's slower than aligned load, but no
! * exception is generated and therefore performance degradation
! * is *incomparable* with RISCs. What we should weigh here is
! * the cost of unaligned access against the cost of aligning data.
! * According to my measurements allowing unaligned access results
! * in ~9% performance improvement on Pentium II operating at
! * 266MHz. I won't be surprised if the difference is higher
! * on faster systems:-)
! *
! * <[EMAIL PROTECTED]>
! */
! #define md5_block_data_order md5_block_host_order
! #endif
! #define DATA_ORDER_IS_LITTLE_ENDIAN
! #define HASH_LONG MD5_LONG
! #define HASH_LONG_LOG2 MD5_LONG_LOG2
! #define HASH_CTX MD5_CTX
! #define HASH_CBLOCK MD5_CBLOCK
! #define HASH_LBLOCK MD5_LBLOCK
! #define HASH_UPDATE MD5_Update
! #define HASH_TRANSFORM MD5_Transform
! #define HASH_FINAL MD5_Final
! #define HASH_BLOCK_HOST_ORDER md5_block_host_order
! #if defined(B_ENDIAN) || defined(md5_block_data_order)
! #define HASH_BLOCK_DATA_ORDER md5_block_data_order
! /*
! * Little-endians (Intel and Alpha) feel better without this.
! * It looks like memcpy does a better job than the generic
! * md5_block_data_order at copying-n-aligning input data.
! * But frankly speaking I didn't expect such a result on Alpha.
! * On the other hand I've got this with egcs-1.0.2, and if
! * the program is compiled with another (better?) compiler it
! * might turn out the other way around.
! *
! * <[EMAIL PROTECTED]>
! */
#endif
+
+ #include "../md32_common.h"
+
/*
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
***************
*** 162,175 ****
#define H(b,c,d) ((b) ^ (c) ^ (d))
#define I(b,c,d) (((~(d)) | (b)) ^ (c))
- #undef ROTATE
- #if defined(WIN32)
- #define ROTATE(a,n) _lrotl(a,n)
- #else
- #define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
- #endif
-
-
#define R0(a,b,c,d,k,s,t) { \
a+=((k)+(t)+F((b),(c),(d))); \
a=ROTATE(a,s); \
--- 150,155 ----
*** ./crypto/md5/md5_one.c.orig Tue Apr 20 00:00:41 1999
--- ./crypto/md5/md5_one.c Sun May 9 16:42:49 1999
***************
*** 57,63 ****
*/
#include <stdio.h>
! #include "md5_locl.h"
unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md)
{
--- 57,63 ----
*/
#include <stdio.h>
! #include <openssl/md5.h>
unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md)
{
*** ./crypto/md5/Makefile.ssl.orig Fri Apr 30 00:00:18 1999
--- ./crypto/md5/Makefile.ssl Sun May 9 17:08:22 1999
***************
*** 66,71 ****
--- 66,79 ----
asm/mx86unix.cpp: asm/md5-586.pl
(cd asm; $(PERL) md5-586.pl cpp >mx86unix.cpp)
+ # works for both SC and gcc
+ asm/md5-sparcv8plus.o: asm/md5-sparcv9.S
+ $(CPP) -DULTRASPARC -DMD5_BLOCK_DATA_ORDER asm/md5-sparcv9.S | as
+-xarch=v8plus /dev/fd/0 -o asm/md5-sparcv8plus.o
+
+ asm/md5-sparcv9.o: asm/md5-sparcv9.S
+ $(CC) -xarch=v9 -DULTRASPARC -DMD5_BLOCK_DATA_ORDER -c asm/md5-sparcv9.S -o
+asm/md5-sparcv9.o
+
+
files:
$(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
***************
*** 103,107 ****
# DO NOT DELETE THIS LINE -- make depend depends on it.
md5_dgst.o: ../../include/openssl/md5.h ../../include/openssl/opensslv.h
! md5_dgst.o: md5_locl.h
md5_one.o: ../../include/openssl/md5.h md5_locl.h
--- 111,115 ----
# DO NOT DELETE THIS LINE -- make depend depends on it.
md5_dgst.o: ../../include/openssl/md5.h ../../include/openssl/opensslv.h
! md5_dgst.o: ../md32_common.h md5_locl.h
md5_one.o: ../../include/openssl/md5.h md5_locl.h
*** ./crypto/md32_common.h.orig Sun May 9 16:42:49 1999
--- ./crypto/md32_common.h Sun May 9 17:48:51 1999
***************
*** 0 ****
--- 1,590 ----
+ /* crypto/md32_common.h */
+ /* ====================================================================
+ * Copyright (c) 1999 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * [EMAIL PROTECTED]
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([EMAIL PROTECTED]). This product includes software written by Tim
+ * Hudson ([EMAIL PROTECTED]).
+ *
+ */
+
+ /*
+ * This is a generic 32-bit "collector" for message digest algorithms.
+ * Whenever needed it collects the input character stream into chunks
+ * of 32-bit values and invokes a block function that performs the
+ * actual hash calculation.
+ *
+ * Porting guide.
+ *
+ * Obligatory macros:
+ *
+ * DATA_ORDER_IS_BIG_ENDIAN or DATA_ORDER_IS_LITTLE_ENDIAN
+ * this macro defines byte order of input stream.
+ * HASH_CBLOCK
+ * size of a unit chunk HASH_BLOCK operates on.
+ * HASH_LONG
+ * has to be at least 32 bits wide; if it's wider, then
+ * HASH_LONG_LOG2 *has to* be defined along with it
+ * HASH_CTX
+ * context structure that at least contains the following
+ * members:
+ * typedef struct {
+ * ...
+ * HASH_LONG Nl,Nh;
+ * HASH_LONG data[HASH_LBLOCK];
+ * int num;
+ * ...
+ * } HASH_CTX;
+ * HASH_UPDATE
+ * name of "Update" function, implemented here.
+ * HASH_TRANSFORM
+ * name of "Transform" function, implemented here.
+ * HASH_FINAL
+ * name of "Final" function, implemented here.
+ * HASH_BLOCK_HOST_ORDER
+ * name of "block" function treating *aligned* input message
+ * in host byte order, implemented externally.
+ * HASH_BLOCK_DATA_ORDER
+ * name of "block" function treating *unaligned* input message
+ * in original (data) byte order, implemented externally (it
+ * actually is optional if data and host are of the same
+ * "endianess").
+ *
+ * Optional macros:
+ *
+ * B_ENDIAN or L_ENDIAN
+ * defines host byte-order.
+ * HASH_LONG_LOG2
+ * defaults to 2 if not stated otherwise.
+ * HASH_LBLOCK
+ * assumed to be HASH_CBLOCK/4 if not stated otherwise.
+ * HASH_BLOCK_DATA_ORDER_ALIGNED
+ * alternative "block" function capable of treating
+ * aligned input message in original (data) order,
+ * implemented externally.
+ *
+ * MD5 example:
+ *
+ * #define DATA_ORDER_IS_LITTLE_ENDIAN
+ *
+ * #define HASH_LONG MD5_LONG
+ * #define HASH_LONG_LOG2 MD5_LONG_LOG2
+ * #define HASH_CTX MD5_CTX
+ * #define HASH_CBLOCK MD5_CBLOCK
+ * #define HASH_LBLOCK MD5_LBLOCK
+ * #define HASH_UPDATE MD5_Update
+ * #define HASH_TRANSFORM MD5_Transform
+ * #define HASH_FINAL MD5_Final
+ * #define HASH_BLOCK_HOST_ORDER md5_block_host_order
+ * #define HASH_BLOCK_DATA_ORDER md5_block_data_order
+ *
+ * <[EMAIL PROTECTED]>
+ */
+
+ #if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+ #error "DATA_ORDER must be defined!"
+ #endif
+
+ #ifndef HASH_CBLOCK
+ #error "HASH_CBLOCK must be defined!"
+ #endif
+ #ifndef HASH_LONG
+ #error "HASH_LONG must be defined!"
+ #endif
+ #ifndef HASH_CTX
+ #error "HASH_CTX must be defined!"
+ #endif
+
+ #ifndef HASH_UPDATE
+ #error "HASH_UPDATE must be defined!"
+ #endif
+ #ifndef HASH_TRANSFORM
+ #error "HASH_TRANSFORM must be defined!"
+ #endif
+ #ifndef HASH_FINAL
+ #error "HASH_FINAL must be defined!"
+ #endif
+
+ #ifndef HASH_BLOCK_HOST_ORDER
+ #error "HASH_BLOCK_HOST_ORDER must be defined!"
+ #endif
+
+ #if 0
+ /*
+ * Moved below as it's required only if HASH_BLOCK_DATA_ORDER_ALIGNED
+ * isn't defined.
+ */
+ #ifndef HASH_BLOCK_DATA_ORDER
+ #error "HASH_BLOCK_DATA_ORDER must be defined!"
+ #endif
+ #endif
+
+ #ifndef HASH_LBLOCK
+ #define HASH_LBLOCK (HASH_CBLOCK/4)
+ #endif
+
+ #ifndef HASH_LONG_LOG2
+ #define HASH_LONG_LOG2 2
+ #endif
+
+ /*
+ * Engage compiler specific rotate intrinsic function if available.
+ */
+ #undef ROTATE
+ #if defined(_MSC_VER)
+ # define ROTATE(a,n) _lrotl(a,n)
+ #elif defined(__GNUC__) && __GNUC__>=2
+ /*
+ * Some GNU C inline assembler templates. Note that these are
+ * rotates by *constant* number of bits! But that's exactly
+ * what we need here...
+ *
+ * <[EMAIL PROTECTED]>
+ */
+ # if defined(__i386)
+ # define ROTATE(a,n) ({ register unsigned int ret; \
+ asm volatile ( \
+ "roll %1,%0" \
+ : "=r"(ret) \
+ : "I"(n), "0"(a) \
+ : "cc"); \
+ ret; \
+ })
+ # elif defined(__powerpc)
+ # define ROTATE(a,n) ({ register unsigned int ret; \
+ asm volatile ( \
+ "rlwinm %0,%1,%2,0,31" \
+ : "=r"(ret) \
+ : "r"(a), "I"(n)); \
+ ret; \
+ })
+ # endif
+ #endif
+
+ /*
+ * Engage compiler specific "fetch in reverse byte order"
+ * intrinsic function if available.
+ */
+ #if defined(__GNUC__) && __GNUC__>=2
+ /* some GNU C inline assembler templates by <[EMAIL PROTECTED]> */
+ # if defined(__i386) && !defined(I386_ONLY)
+ # define BE_FETCH32(a) ({ register unsigned int l=(a);\
+ asm volatile ( \
+ "bswapl %0" \
+ : "=r"(l) : "0"(l)); \
+ l; \
+ })
+ # elif defined(__powerpc)
+ # define LE_FETCH32(a) ({ register unsigned int l; \
+ asm volatile ( \
+ "lwbrx %0,0,%1" \
+ : "=r"(l) \
+ : "r"(a)); \
+ l; \
+ })
+
+ # elif defined(__sparc) && defined(ULTRASPARC)
+ # define LE_FETCH32(a) ({ register unsigned int l; \
+ asm volatile ( \
+ "lda [%1]#ASI_PRIMARY_LITTLE,%0"\
+ : "=r"(l) \
+ : "r"(a)); \
+ l; \
+ })
+ # endif
+ #endif
+
+ #if HASH_LONG_LOG2==2 /* Engage only if sizeof(HASH_LONG)== 4 */
+ /* A nice byte order reversal from Wei Dai <[EMAIL PROTECTED]> */
+ #ifdef ROTATE
+ /* 5 instructions with rotate instruction, else 9 */
+ #define REVERSE_FETCH32(a,l) ( \
+ l=*(const HASH_LONG *)(a), \
+ ((ROTATE(l,8)&0x00FF00FF)|(ROTATE((l&0x00FF00FF),24))) \
+ )
+ #else
+ /* 6 instructions with rotate instruction, else 8 */
+ #define REVERSE_FETCH32(a,l) ( \
+ l=*(const HASH_LONG *)(a), \
+ l=(((l>>8)&0x00FF00FF)|((l&0x00FF00FF)<<8)), \
+ ROTATE(l,16) \
+ )
+ /*
+ * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|...
+ * It's rewritten as above for two reasons:
+ * - RISCs aren't good at long constants and have to explicitly
+ * compose 'em with several (well, usually 2) instructions in a
+ * register before performing the actual operation, and (as you
+ * already realized:-) using the same constant twice should
+ * inspire the compiler to permanently dedicate a register to it;
+ * - most modern CPUs have two ALUs, but usually only one has
+ * circuitry for shifts:-( this minor tweak inspires the compiler
+ * to schedule shift instructions in a better way...
+ *
+ * <[EMAIL PROTECTED]>
+ */
+ #endif
+ #endif
+
+ #ifndef ROTATE
+ #define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+ #endif
+
+ /*
+ * Make some obvious choices. E.g., HASH_BLOCK_DATA_ORDER_ALIGNED
+ * and HASH_BLOCK_HOST_ORDER ought to be the same if input data
+ * and host are of the same "endianness". It's possible to mask
+ * this with blank #define HASH_BLOCK_DATA_ORDER though...
+ *
+ * <[EMAIL PROTECTED]>
+ */
+ #if defined(B_ENDIAN)
+ # if defined(DATA_ORDER_IS_BIG_ENDIAN)
+ # if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
+ # define HASH_BLOCK_DATA_ORDER_ALIGNED HASH_BLOCK_HOST_ORDER
+ # endif
+ # elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+ # ifndef HOST_FETCH32
+ # ifdef LE_FETCH32
+ # define HOST_FETCH32(p,l) LE_FETCH32(p)
+ # elif defined(REVERSE_FETCH32)
+ # define HOST_FETCH32(p,l) REVERSE_FETCH32(p,l)
+ # endif
+ # endif
+ # endif
+ #elif defined(L_ENDIAN)
+ # if defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+ # if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
+ # define HASH_BLOCK_DATA_ORDER_ALIGNED HASH_BLOCK_HOST_ORDER
+ # endif
+ # elif defined(DATA_ORDER_IS_BIG_ENDIAN)
+ # ifndef HOST_FETCH32
+ # ifdef BE_FETCH32
+ # define HOST_FETCH32(p,l) BE_FETCH32(p)
+ # elif defined(REVERSE_FETCH32)
+ # define HOST_FETCH32(p,l) REVERSE_FETCH32(p,l)
+ # endif
+ # endif
+ # endif
+ #endif
+
+ #if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+ #ifndef HASH_BLOCK_DATA_ORDER
+ #error "HASH_BLOCK_DATA_ORDER must be defined!"
+ #endif
+ #endif
+
+ #if defined(DATA_ORDER_IS_BIG_ENDIAN)
+
+ #define HOST_c2l(c,l) (l =(((unsigned long)(*((c)++)))<<24), \
+ l|=(((unsigned long)(*((c)++)))<<16), \
+ l|=(((unsigned long)(*((c)++)))<< 8), \
+ l|=(((unsigned long)(*((c)++))) ), \
+ l)
+ #define HOST_p_c2l(c,l,n) { \
+ switch (n) { \
+ case 0: l =((unsigned long)(*((c)++)))<<24; \
+ case 1: l|=((unsigned long)(*((c)++)))<<16; \
+ case 2: l|=((unsigned long)(*((c)++)))<< 8; \
+ case 3: l|=((unsigned long)(*((c)++))); \
+ } }
+ #define HOST_p_c2l_p(c,l,sc,len) { \
+ switch (sc) { \
+ case 0: l =((unsigned long)(*((c)++)))<<24; \
+ if (--len == 0) break; \
+ case 1: l|=((unsigned long)(*((c)++)))<<16; \
+ if (--len == 0) break; \
+ case 2: l|=((unsigned long)(*((c)++)))<< 8; \
+ } }
+ /* NOTE the pointer is not incremented at the end of this */
+ #define HOST_c2l_p(c,l,n) { \
+ l=0; (c)+=n; \
+ switch (n) { \
+ case 3: l =((unsigned long)(*(--(c))))<< 8; \
+ case 2: l|=((unsigned long)(*(--(c))))<<16; \
+ case 1: l|=((unsigned long)(*(--(c))))<<24; \
+ } }
+ #define HOST_l2c(l,c) (*((c)++)=(unsigned char)(((l)>>24)&0xff), \
+ *((c)++)=(unsigned char)(((l)>>16)&0xff), \
+ *((c)++)=(unsigned char)(((l)>> 8)&0xff), \
+ *((c)++)=(unsigned char)(((l) )&0xff), \
+ l)
+
+ #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+
+ #define HOST_c2l(c,l) (l =(((unsigned long)(*((c)++))) ), \
+ l|=(((unsigned long)(*((c)++)))<< 8), \
+ l|=(((unsigned long)(*((c)++)))<<16), \
+ l|=(((unsigned long)(*((c)++)))<<24), \
+ l)
+ #define HOST_p_c2l(c,l,n) { \
+ switch (n) { \
+ case 0: l =((unsigned long)(*((c)++))); \
+ case 1: l|=((unsigned long)(*((c)++)))<< 8; \
+ case 2: l|=((unsigned long)(*((c)++)))<<16; \
+ case 3: l|=((unsigned long)(*((c)++)))<<24; \
+ } }
+ #define HOST_p_c2l_p(c,l,sc,len) { \
+ switch (sc) { \
+ case 0: l =((unsigned long)(*((c)++))); \
+ if (--len == 0) break; \
+ case 1: l|=((unsigned long)(*((c)++)))<< 8; \
+ if (--len == 0) break; \
+ case 2: l|=((unsigned long)(*((c)++)))<<16; \
+ } }
+ /* NOTE the pointer is not incremented at the end of this */
+ #define HOST_c2l_p(c,l,n) { \
+ l=0; (c)+=n; \
+ switch (n) { \
+ case 3: l =((unsigned long)(*(--(c))))<<16; \
+ case 2: l|=((unsigned long)(*(--(c))))<< 8; \
+ case 1: l|=((unsigned long)(*(--(c)))); \
+ } }
+ #define HOST_l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \
+ *((c)++)=(unsigned char)(((l)>> 8)&0xff), \
+ *((c)++)=(unsigned char)(((l)>>16)&0xff), \
+ *((c)++)=(unsigned char)(((l)>>24)&0xff), \
+ l)
+
+ #endif
+
+ /*
+ * Time for some action:-)
+ */
+
+ void HASH_UPDATE (HASH_CTX *c, const unsigned char *data, unsigned long len)
+ {
+ register HASH_LONG * p;
+ register unsigned long l;
+ int sw,sc,ew,ec;
+
+ if (len==0) return;
+
+ l=(c->Nl+(len<<3))&0xffffffffL;
+ /* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
+ * Wei Dai <[EMAIL PROTECTED]> for pointing it out. */
+ if (l < c->Nl) /* overflow */
+ c->Nh++;
+ c->Nh+=(len>>29);
+ c->Nl=l;
+
+ if (c->num != 0)
+ {
+ p=c->data;
+ sw=c->num>>2;
+ sc=c->num&0x03;
+
+ if ((c->num+len) >= HASH_CBLOCK)
+ {
+ l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l;
+ for (; sw<HASH_LBLOCK; sw++)
+ {
+ HOST_c2l(data,l); p[sw]=l;
+ }
+ HASH_BLOCK_HOST_ORDER (c,p,1);
+ len-=(HASH_CBLOCK-c->num);
+ c->num=0;
+ /* drop through and do the rest */
+ }
+ else
+ {
+ c->num+=len;
+ if ((sc+len) < 4) /* ugly, add char's to a word */
+ {
+ l=p[sw]; HOST_p_c2l_p(data,l,sc,len); p[sw]=l;
+ }
+ else
+ {
+ ew=(c->num>>2);
+ ec=(c->num&0x03);
+ l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l;
+ for (; sw < ew; sw++)
+ {
+ HOST_c2l(data,l); p[sw]=l;
+ }
+ if (ec)
+ {
+ HOST_c2l_p(data,l,ec); p[sw]=l;
+ }
+ }
+ return;
+ }
+ }
+
+ sw=len/HASH_CBLOCK;
+ if (sw > 0)
+ {
+ #if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+ /*
+ * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined
+ * only if sizeof(HASH_LONG)==4.
+ */
+ if ((((unsigned long)data)%4) == 0)
+ {
+ HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,sw);
+ sw*=HASH_CBLOCK;
+ data+=sw;
+ len-=sw;
+ }
+ else
+ #if !defined(HASH_BLOCK_DATA_ORDER)
+ while (sw--)
+ {
+ memcpy (p=c->data,data,HASH_CBLOCK);
+ HASH_BLOCK_DATA_ORDER_ALIGNED(c,p,1);
+ data+=HASH_CBLOCK;
+ len-=HASH_CBLOCK;
+ }
+ #endif
+ #endif
+ #if defined(HASH_BLOCK_DATA_ORDER)
+ {
+ HASH_BLOCK_DATA_ORDER (c,data,sw);
+ sw*=HASH_CBLOCK;
+ data+=sw;
+ len-=sw;
+ }
+ #endif
+ }
+
+ if (len!=0)
+ {
+ p = c->data;
+ c->num = len;
+ ew=len>>2; /* words to copy */
+ ec=len&0x03;
+ for (; ew; ew--,p++)
+ {
+ HOST_c2l(data,l); *p=l;
+ }
+ HOST_c2l_p(data,l,ec);
+ *p=l;
+ }
+ }
+
+
+ void HASH_TRANSFORM (HASH_CTX *c, unsigned char *data)
+ {
+ #if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+ if ((((unsigned long)data)%4) == 0)
+ HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,1);
+ else
+ #if !defined(HASH_BLOCK_DATA_ORDER)
+ {
+ memcpy (c->data,data,HASH_CBLOCK);
+ HASH_BLOCK_DATA_ORDER_ALIGNED (c,c->data,1);
+ }
+ #endif
+ #endif
+ #if defined(HASH_BLOCK_DATA_ORDER)
+ HASH_BLOCK_DATA_ORDER (c,data,1);
+ #endif
+ }
+
+
+ void HASH_FINAL (unsigned char *md, HASH_CTX *c)
+ {
+ register HASH_LONG *p;
+ register unsigned long l;
+ register int i,j;
+ static const unsigned char end[4]={0x80,0x00,0x00,0x00};
+ const unsigned char *cp=end;
+
+ /* c->num should definitely have room for at least one more byte. */
+ p=c->data;
+ i=c->num>>2;
+ j=c->num&0x03;
+
+ #if 0
+ /* purify often complains about the following line as an
+ * Uninitialized Memory Read. While this can be true, the
+ * following p_c2l macro will reset l when that case is true.
+ * This is because j&0x03 contains the number of 'valid' bytes
+ * already in p[i]. If and only if j&0x03 == 0, the UMR will
+ * occur but this is also the only time p_c2l will do
+ * l= *(cp++) instead of l|= *(cp++)
+ * Many thanks to Alex Tang <[EMAIL PROTECTED]> for picking up this
+ * 'potential bug' */
+ #ifdef PURIFY
+ if (j==0) p[i]=0; /* Yeah, but that's not the way to fix it:-) */
+ #endif
+ l=p[i];
+ #else
+ l = (j==0) ? 0 : p[i];
+ #endif
+ HOST_p_c2l(cp,l,j); p[i++]=l; /* i is the next 'undefined word' */
+
+ if (i>(HASH_LBLOCK-2)) /* save room for Nl and Nh */
+ {
+ if (i<HASH_LBLOCK) p[i]=0;
+ HASH_BLOCK_HOST_ORDER (c,p,1);
+ i=0;
+ }
+ for (; i<(HASH_LBLOCK-2); i++)
+ p[i]=0;
+
+ #if defined(DATA_ORDER_IS_BIG_ENDIAN)
+ p[HASH_LBLOCK-2]=c->Nh;
+ p[HASH_LBLOCK-1]=c->Nl;
+ #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+ p[HASH_LBLOCK-2]=c->Nl;
+ p[HASH_LBLOCK-1]=c->Nh;
+ #endif
+ HASH_BLOCK_HOST_ORDER (c,p,1);
+
+ l=c->A; HOST_l2c(l,md);
+ l=c->B; HOST_l2c(l,md);
+ l=c->C; HOST_l2c(l,md);
+ l=c->D; HOST_l2c(l,md);
+
+ c->num=0;
+ /* clear stuff, HASH_BLOCK may be leaving some stuff on the stack
+ * but I'm not worried :-)
+ memset((void *)c,0,sizeof(HASH_CTX));
+ */
+ }
*** ./crypto/md5/asm/md5-586.pl.orig Sat Jan 30 19:00:09 1999
--- ./crypto/md5/asm/md5-586.pl Sun May 9 16:42:49 1999
***************
*** 29,35 ****
0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3
);
! &md5_block("md5_block_x86");
&asm_finish();
sub Np
--- 29,35 ----
0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3
);
! &md5_block("md5_block_asm_host_order");
&asm_finish();
sub Np
***************
*** 183,188 ****
--- 183,189 ----
&mov($X, &wparam(1)); # esi
&mov($C, &wparam(2));
&push("ebp");
+ &shl($C, 6);
&push("ebx");
&add($C, $X); # offset we end at
&sub($C, 64);
*** ./crypto/md5/asm/md5-sparcv9.S.orig Sun May 9 16:42:49 1999
--- ./crypto/md5/asm/md5-sparcv9.S Sun May 9 16:42:49 1999
***************
*** 0 ****
--- 1,1035 ----
+ .ident "md5-sparcv9.S, Version 1.0"
+ .ident "SPARC V9 ISA artwork by Andy Polyakov <[EMAIL PROTECTED]>"
+ .file "md5-sparcv9.S"
+
+ /*
+ * ====================================================================
+ * Copyright (c) 1999 Andy Polyakov <[EMAIL PROTECTED]>.
+ *
+ * Rights for redistribution and usage in source and binary forms are
+ * granted as long as above copyright notices are retained. Warranty
+ * of any kind is (of course:-) disclaimed.
+ * ====================================================================
+ */
+
+ /*
+ * This is my modest contribution to the OpenSSL project (see
+ * http://www.openssl.org/ for more information about it) and is an
+ * assembler implementation of the MD5 block hash function. I've
+ * hand-coded this for the sole reason of reaching the UltraSPARC-specific
+ * "load in little-endian byte order" instruction. This gives up to 15%
+ * performance improvement for cases when the input message is aligned
+ * at a 32-bit boundary. The module was tested under both 32 *and* 64 bit
+ * kernels. For updates see http://fy.chalmers.se/~appro/hpe/.
+ *
+ * To compile with SC4.x/SC5.x:
+ *
+ * cc -xarch=v[9|8plus] -DULTRASPARC -DMD5_BLOCK_DATA_ORDER \
+ * -c md5-sparcv9.S
+ *
+ * and with gcc:
+ *
+ * gcc -mcpu=ultrasparc -DULTRASPARC -DMD5_BLOCK_DATA_ORDER \
+ * -c md5-sparcv9.S
+ *
+ * or if the above fails (it does if you have gas):
+ *
+ * gcc -E -DULTRASPARC -DMD5_BLOCK_DATA_ORDER md5-sparcv9.S | \
+ * as -xarch=v8plus /dev/fd/0 -o md5-sparcv9.o
+ */
+
+ #define A %o0
+ #define B %o1
+ #define C %o2
+ #define D %o3
+ #define T1 %o4
+ #define T2 %o5
+
+ #define R0 %l0
+ #define R1 %l1
+ #define R2 %l2
+ #define R3 %l3
+ #define R4 %l4
+ #define R5 %l5
+ #define R6 %l6
+ #define R7 %l7
+ #define R8 %i3
+ #define R9 %i4
+ #define R10 %i5
+ #define R11 %g1
+ #define R12 %g2
+ #define R13 %g3
+ #define RX %g4
+
+ #define Aptr %i0+0
+ #define Bptr %i0+4
+ #define Cptr %i0+8
+ #define Dptr %i0+12
+
+ #define Aval R5 /* those not used at the end of the last round */
+ #define Bval R6
+ #define Cval R7
+ #define Dval R8
+
+ #if defined(MD5_BLOCK_DATA_ORDER)
+ # if defined(ULTRASPARC)
+ # define LOAD lda
+ # define X(i) [%i1+i*4]%asi
+ # define md5_block md5_block_asm_data_order_aligned
+ # define ASI_PRIMARY_LITTLE 0x88
+ # else
+ # error "MD5_BLOCK_DATA_ORDER is supported only on UltraSPARC!"
+ # endif
+ #else
+ # define LOAD ld
+ # define X(i) [%i1+i*4]
+ # define md5_block md5_block_asm_host_order
+ #endif
+
+ .section ".text",#alloc,#execinstr
+ #if defined(__SUNPRO_C) && defined(__sparcv9)
+ /* They've said -xarch=v9 at command line */
+ .register %g2,#scratch
+ .register %g3,#scratch
+ # define FRAME -192
+ #else
+ # define FRAME -96
+ #endif
+
+ .align 32
+
+ .global md5_block
+ md5_block:
+ save %sp,FRAME,%sp
+
+ ld [Dptr],D
+ #ifdef ASI_PRIMARY_LITTLE
+ mov %asi,%o7 ! How dare I? Well, I just do:-)
+ #else
+ nop
+ #endif
+ ld [Cptr],C
+ #ifdef ASI_PRIMARY_LITTLE
+ mov ASI_PRIMARY_LITTLE,%asi
+ #else
+ nop
+ #endif
+ ld [Bptr],B
+ nop
+ ld [Aptr],A
+ nop
+ LOAD X(0),R0
+ nop
+ ba .Lmd5_block_loop
+ nop
+
+ .align 32
+ .Lmd5_block_loop:
+
+ !!!!!!!!Round 0
+
+ xor C,D,T1
+ sethi %hi(0xd76aa478),T2
+ and T1,B,T1
+ or T2,%lo(0xd76aa478),T2 !=
+ xor T1,D,T1
+ add T1,R0,T1
+ LOAD X(1),R1
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,7,T2
+ srl A,32-7,A
+ or A,T2,A !=
+ xor B,C,T1
+ add A,B,A
+
+ sethi %hi(0xe8c7b756),T2
+ and T1,A,T1 !=
+ or T2,%lo(0xe8c7b756),T2
+ xor T1,C,T1
+ LOAD X(2),R2
+ add T1,R1,T1 !=
+ add T1,T2,T1
+ add D,T1,D
+ sll D,12,T2
+ srl D,32-12,D !=
+ or D,T2,D
+ xor A,B,T1
+ add D,A,D
+
+ sethi %hi(0x242070db),T2 !=
+ and T1,D,T1
+ or T2,%lo(0x242070db),T2
+ xor T1,B,T1
+ add T1,R2,T1 !=
+ LOAD X(3),R3
+ add T1,T2,T1
+ add C,T1,C
+ sll C,17,T2 !=
+ srl C,32-17,C
+ or C,T2,C
+ xor D,A,T1
+ add C,D,C !=
+
+ sethi %hi(0xc1bdceee),T2
+ and T1,C,T1
+ or T2,%lo(0xc1bdceee),T2
+ xor T1,A,T1 !=
+ add T1,R3,T1
+ LOAD X(4),R4
+ add T1,T2,T1
+ add B,T1,B !=
+ sll B,22,T2
+ srl B,32-22,B
+ or B,T2,B
+ xor C,D,T1 !=
+ add B,C,B
+
+ sethi %hi(0xf57c0faf),T2
+ and T1,B,T1
+ or T2,%lo(0xf57c0faf),T2 !=
+ xor T1,D,T1
+ add T1,R4,T1
+ LOAD X(5),R5
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,7,T2
+ srl A,32-7,A
+ or A,T2,A !=
+ xor B,C,T1
+ add A,B,A
+
+ sethi %hi(0x4787c62a),T2
+ and T1,A,T1 !=
+ or T2,%lo(0x4787c62a),T2
+ xor T1,C,T1
+ LOAD X(6),R6
+ add T1,R5,T1 !=
+ add T1,T2,T1
+ add D,T1,D
+ sll D,12,T2
+ srl D,32-12,D !=
+ or D,T2,D
+ xor A,B,T1
+ add D,A,D
+
+ sethi %hi(0xa8304613),T2 !=
+ and T1,D,T1
+ or T2,%lo(0xa8304613),T2
+ xor T1,B,T1
+ add T1,R6,T1 !=
+ LOAD X(7),R7
+ add T1,T2,T1
+ add C,T1,C
+ sll C,17,T2 !=
+ srl C,32-17,C
+ or C,T2,C
+ xor D,A,T1
+ add C,D,C !=
+
+ sethi %hi(0xfd469501),T2
+ and T1,C,T1
+ or T2,%lo(0xfd469501),T2
+ xor T1,A,T1 !=
+ add T1,R7,T1
+ LOAD X(8),R8
+ add T1,T2,T1
+ add B,T1,B !=
+ sll B,22,T2
+ srl B,32-22,B
+ or B,T2,B
+ xor C,D,T1 !=
+ add B,C,B
+
+ sethi %hi(0x698098d8),T2
+ and T1,B,T1
+ or T2,%lo(0x698098d8),T2 !=
+ xor T1,D,T1
+ add T1,R8,T1
+ LOAD X(9),R9
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,7,T2
+ srl A,32-7,A
+ or A,T2,A !=
+ xor B,C,T1
+ add A,B,A
+
+ sethi %hi(0x8b44f7af),T2
+ and T1,A,T1 !=
+ or T2,%lo(0x8b44f7af),T2
+ xor T1,C,T1
+ LOAD X(10),R10
+ add T1,R9,T1 !=
+ add T1,T2,T1
+ add D,T1,D
+ sll D,12,T2
+ srl D,32-12,D !=
+ or D,T2,D
+ xor A,B,T1
+ add D,A,D
+
+ sethi %hi(0xffff5bb1),T2 !=
+ and T1,D,T1
+ or T2,%lo(0xffff5bb1),T2
+ xor T1,B,T1
+ add T1,R10,T1 !=
+ LOAD X(11),R11
+ add T1,T2,T1
+ add C,T1,C
+ sll C,17,T2 !=
+ srl C,32-17,C
+ or C,T2,C
+ xor D,A,T1
+ add C,D,C !=
+
+ sethi %hi(0x895cd7be),T2
+ and T1,C,T1
+ or T2,%lo(0x895cd7be),T2
+ xor T1,A,T1 !=
+ add T1,R11,T1
+ LOAD X(12),R12
+ add T1,T2,T1
+ add B,T1,B !=
+ sll B,22,T2
+ srl B,32-22,B
+ or B,T2,B
+ xor C,D,T1 !=
+ add B,C,B
+
+ sethi %hi(0x6b901122),T2
+ and T1,B,T1
+ or T2,%lo(0x6b901122),T2 !=
+ xor T1,D,T1
+ add T1,R12,T1
+ LOAD X(13),R13
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,7,T2
+ srl A,32-7,A
+ or A,T2,A !=
+ xor B,C,T1
+ add A,B,A
+
+ sethi %hi(0xfd987193),T2
+ and T1,A,T1 !=
+ or T2,%lo(0xfd987193),T2
+ xor T1,C,T1
+ LOAD X(14),RX
+ add T1,R13,T1 !=
+ add T1,T2,T1
+ add D,T1,D
+ sll D,12,T2
+ srl D,32-12,D !=
+ or D,T2,D
+ xor A,B,T1
+ add D,A,D
+
+ sethi %hi(0xa679438e),T2 !=
+ and T1,D,T1
+ or T2,%lo(0xa679438e),T2
+ xor T1,B,T1
+ add T1,RX,T1 !=
+ LOAD X(15),RX
+ add T1,T2,T1
+ add C,T1,C
+ sll C,17,T2 !=
+ srl C,32-17,C
+ or C,T2,C
+ xor D,A,T1
+ add C,D,C !=
+
+ sethi %hi(0x49b40821),T2
+ and T1,C,T1
+ or T2,%lo(0x49b40821),T2
+ xor T1,A,T1 !=
+ add T1,RX,T1
+ !pre-LOADed X(1),R1
+ add T1,T2,T1
+ add B,T1,B
+ sll B,22,T2 !=
+ srl B,32-22,B
+ or B,T2,B
+ add B,C,B
+
+ !!!!!!!!Round 1
+
+ xor B,C,T1 !=
+ sethi %hi(0xf61e2562),T2
+ and T1,D,T1
+ or T2,%lo(0xf61e2562),T2
+ xor T1,C,T1 !=
+ add T1,R1,T1
+ !pre-LOADed X(6),R6
+ add T1,T2,T1
+ add A,T1,A
+ sll A,5,T2 !=
+ srl A,32-5,A
+ or A,T2,A
+ add A,B,A
+
+ xor A,B,T1 !=
+ sethi %hi(0xc040b340),T2
+ and T1,C,T1
+ or T2,%lo(0xc040b340),T2
+ xor T1,B,T1 !=
+ add T1,R6,T1
+ !pre-LOADed X(11),R11
+ add T1,T2,T1
+ add D,T1,D
+ sll D,9,T2 !=
+ srl D,32-9,D
+ or D,T2,D
+ add D,A,D
+
+ xor D,A,T1 !=
+ sethi %hi(0x265e5a51),T2
+ and T1,B,T1
+ or T2,%lo(0x265e5a51),T2
+ xor T1,A,T1 !=
+ add T1,R11,T1
+ !pre-LOADed X(0),R0
+ add T1,T2,T1
+ add C,T1,C
+ sll C,14,T2 !=
+ srl C,32-14,C
+ or C,T2,C
+ add C,D,C
+
+ xor C,D,T1 !=
+ sethi %hi(0xe9b6c7aa),T2
+ and T1,A,T1
+ or T2,%lo(0xe9b6c7aa),T2
+ xor T1,D,T1 !=
+ add T1,R0,T1
+ !pre-LOADed X(5),R5
+ add T1,T2,T1
+ add B,T1,B
+ sll B,20,T2 !=
+ srl B,32-20,B
+ or B,T2,B
+ add B,C,B
+
+ xor B,C,T1 !=
+ sethi %hi(0xd62f105d),T2
+ and T1,D,T1
+ or T2,%lo(0xd62f105d),T2
+ xor T1,C,T1 !=
+ add T1,R5,T1
+ !pre-LOADed X(10),R10
+ add T1,T2,T1
+ add A,T1,A
+ sll A,5,T2 !=
+ srl A,32-5,A
+ or A,T2,A
+ add A,B,A
+
+ xor A,B,T1 !=
+ sethi %hi(0x02441453),T2
+ and T1,C,T1
+ or T2,%lo(0x02441453),T2
+ xor T1,B,T1 !=
+ add T1,R10,T1
+ LOAD X(15),RX
+ add T1,T2,T1
+ add D,T1,D !=
+ sll D,9,T2
+ srl D,32-9,D
+ or D,T2,D
+ add D,A,D !=
+
+ xor D,A,T1
+ sethi %hi(0xd8a1e681),T2
+ and T1,B,T1
+ or T2,%lo(0xd8a1e681),T2 !=
+ xor T1,A,T1
+ add T1,RX,T1
+ !pre-LOADed X(4),R4
+ add T1,T2,T1
+ add C,T1,C !=
+ sll C,14,T2
+ srl C,32-14,C
+ or C,T2,C
+ add C,D,C !=
+
+ xor C,D,T1
+ sethi %hi(0xe7d3fbc8),T2
+ and T1,A,T1
+ or T2,%lo(0xe7d3fbc8),T2 !=
+ xor T1,D,T1
+ add T1,R4,T1
+ !pre-LOADed X(9),R9
+ add T1,T2,T1
+ add B,T1,B !=
+ sll B,20,T2
+ srl B,32-20,B
+ or B,T2,B
+ add B,C,B !=
+
+ xor B,C,T1
+ sethi %hi(0x21e1cde6),T2
+ and T1,D,T1
+ or T2,%lo(0x21e1cde6),T2 !=
+ xor T1,C,T1
+ add T1,R9,T1
+ LOAD X(14),RX
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,5,T2
+ srl A,32-5,A
+ or A,T2,A !=
+ add A,B,A
+
+ xor A,B,T1
+ sethi %hi(0xc33707d6),T2
+ and T1,C,T1 !=
+ or T2,%lo(0xc33707d6),T2
+ xor T1,B,T1
+ add T1,RX,T1
+ !pre-LOADed X(3),R3
+ add T1,T2,T1 !=
+ add D,T1,D
+ sll D,9,T2
+ srl D,32-9,D
+ or D,T2,D !=
+ add D,A,D
+
+ xor D,A,T1
+ sethi %hi(0xf4d50d87),T2
+ and T1,B,T1 !=
+ or T2,%lo(0xf4d50d87),T2
+ xor T1,A,T1
+ add T1,R3,T1
+ !pre-LOADed X(8),R8
+ add T1,T2,T1 !=
+ add C,T1,C
+ sll C,14,T2
+ srl C,32-14,C
+ or C,T2,C !=
+ add C,D,C
+
+ xor C,D,T1
+ sethi %hi(0x455a14ed),T2
+ and T1,A,T1 !=
+ or T2,%lo(0x455a14ed),T2
+ xor T1,D,T1
+ add T1,R8,T1
+ !pre-LOADed X(13),R13
+ add T1,T2,T1 !=
+ add B,T1,B
+ sll B,20,T2
+ srl B,32-20,B
+ or B,T2,B !=
+ add B,C,B
+
+ xor B,C,T1
+ sethi %hi(0xa9e3e905),T2
+ and T1,D,T1 !=
+ or T2,%lo(0xa9e3e905),T2
+ xor T1,C,T1
+ add T1,R13,T1
+ !pre-LOADed X(2),R2
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,5,T2
+ srl A,32-5,A
+ or A,T2,A !=
+ add A,B,A
+
+ xor A,B,T1
+ sethi %hi(0xfcefa3f8),T2
+ and T1,C,T1 !=
+ or T2,%lo(0xfcefa3f8),T2
+ xor T1,B,T1
+ add T1,R2,T1
+ !pre-LOADed X(7),R7
+ add T1,T2,T1 !=
+ add D,T1,D
+ sll D,9,T2
+ srl D,32-9,D
+ or D,T2,D !=
+ add D,A,D
+
+ xor D,A,T1
+ sethi %hi(0x676f02d9),T2
+ and T1,B,T1 !=
+ or T2,%lo(0x676f02d9),T2
+ xor T1,A,T1
+ add T1,R7,T1
+ !pre-LOADed X(12),R12
+ add T1,T2,T1 !=
+ add C,T1,C
+ sll C,14,T2
+ srl C,32-14,C
+ or C,T2,C !=
+ add C,D,C
+
+ xor C,D,T1
+ sethi %hi(0x8d2a4c8a),T2
+ and T1,A,T1 !=
+ or T2,%lo(0x8d2a4c8a),T2
+ xor T1,D,T1
+ add T1,R12,T1
+ !pre-LOADed X(5),R5
+ add T1,T2,T1 !=
+ add B,T1,B
+ sll B,20,T2
+ srl B,32-20,B
+ or B,T2,B !=
+ add B,C,B
+
+ !!!!!!!!Round 2
+
+ xor B,C,T1
+ sethi %hi(0xfffa3942),T2
+ xor T1,D,T1 !=
+ or T2,%lo(0xfffa3942),T2
+ add T1,R5,T1
+ !pre-LOADed X(8),R8
+ add T1,T2,T1
+ add A,T1,A !=
+ sll A,4,T2
+ srl A,32-4,A
+ or A,T2,A
+ add A,B,A !=
+
+ xor A,B,T1
+ sethi %hi(0x8771f681),T2
+ xor T1,C,T1
+ or T2,%lo(0x8771f681),T2 !=
+ add T1,R8,T1
+ !pre-LOADed X(11),R11
+ add T1,T2,T1
+ add D,T1,D
+ sll D,11,T2 !=
+ srl D,32-11,D
+ or D,T2,D
+ add D,A,D
+
+ xor D,A,T1 !=
+ sethi %hi(0x6d9d6122),T2
+ xor T1,B,T1
+ or T2,%lo(0x6d9d6122),T2
+ add T1,R11,T1 !=
+ LOAD X(14),RX
+ add T1,T2,T1
+ add C,T1,C
+ sll C,16,T2 !=
+ srl C,32-16,C
+ or C,T2,C
+ add C,D,C
+
+ xor C,D,T1 !=
+ sethi %hi(0xfde5380c),T2
+ xor T1,A,T1
+ or T2,%lo(0xfde5380c),T2
+ add T1,RX,T1 !=
+ !pre-LOADed X(1),R1
+ add T1,T2,T1
+ add B,T1,B
+ sll B,23,T2
+ srl B,32-23,B !=
+ or B,T2,B
+ add B,C,B
+
+ xor B,C,T1
+ sethi %hi(0xa4beea44),T2 !=
+ xor T1,D,T1
+ or T2,%lo(0xa4beea44),T2
+ add T1,R1,T1
+ !pre-LOADed X(4),R4
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,4,T2
+ srl A,32-4,A
+ or A,T2,A !=
+ add A,B,A
+
+ xor A,B,T1
+ sethi %hi(0x4bdecfa9),T2
+ xor T1,C,T1 !=
+ or T2,%lo(0x4bdecfa9),T2
+ add T1,R4,T1
+ !pre-LOADed X(7),R7
+ add T1,T2,T1
+ add D,T1,D !=
+ sll D,11,T2
+ srl D,32-11,D
+ or D,T2,D
+ add D,A,D !=
+
+ xor D,A,T1
+ sethi %hi(0xf6bb4b60),T2
+ xor T1,B,T1
+ or T2,%lo(0xf6bb4b60),T2 !=
+ add T1,R7,T1
+ !pre-LOADed X(10),R10
+ add T1,T2,T1
+ add C,T1,C
+ sll C,16,T2 !=
+ srl C,32-16,C
+ or C,T2,C
+ add C,D,C
+
+ xor C,D,T1 !=
+ sethi %hi(0xbebfbc70),T2
+ xor T1,A,T1
+ or T2,%lo(0xbebfbc70),T2
+ add T1,R10,T1 !=
+ !pre-LOADed X(13),R13
+ add T1,T2,T1
+ add B,T1,B
+ sll B,23,T2
+ srl B,32-23,B !=
+ or B,T2,B
+ add B,C,B
+
+ xor B,C,T1
+ sethi %hi(0x289b7ec6),T2 !=
+ xor T1,D,T1
+ or T2,%lo(0x289b7ec6),T2
+ add T1,R13,T1
+ !pre-LOADed X(0),R0
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,4,T2
+ srl A,32-4,A
+ or A,T2,A !=
+ add A,B,A
+
+ xor A,B,T1
+ sethi %hi(0xeaa127fa),T2
+ xor T1,C,T1 !=
+ or T2,%lo(0xeaa127fa),T2
+ add T1,R0,T1
+ !pre-LOADed X(3),R3
+ add T1,T2,T1
+ add D,T1,D !=
+ sll D,11,T2
+ srl D,32-11,D
+ or D,T2,D
+ add D,A,D !=
+
+ xor D,A,T1
+ sethi %hi(0xd4ef3085),T2
+ xor T1,B,T1
+ or T2,%lo(0xd4ef3085),T2 !=
+ add T1,R3,T1
+ !pre-LOADed X(6),R6
+ add T1,T2,T1
+ add C,T1,C
+ sll C,16,T2 !=
+ srl C,32-16,C
+ or C,T2,C
+ add C,D,C
+
+ xor C,D,T1 !=
+ sethi %hi(0x04881d05),T2
+ xor T1,A,T1
+ or T2,%lo(0x04881d05),T2
+ add T1,R6,T1 !=
+ !pre-LOADed X(9),R9
+ add T1,T2,T1
+ add B,T1,B
+ sll B,23,T2
+ srl B,32-23,B !=
+ or B,T2,B
+ add B,C,B
+
+ xor B,C,T1
+ sethi %hi(0xd9d4d039),T2 !=
+ xor T1,D,T1
+ or T2,%lo(0xd9d4d039),T2
+ add T1,R9,T1
+ !pre-LOADed X(12),R12
+ add T1,T2,T1 !=
+ add A,T1,A
+ sll A,4,T2
+ srl A,32-4,A
+ or A,T2,A !=
+ add A,B,A
+
+ xor A,B,T1
+ sethi %hi(0xe6db99e5),T2
+ xor T1,C,T1 !=
+ or T2,%lo(0xe6db99e5),T2
+ add T1,R12,T1
+ LOAD X(15),RX
+ add T1,T2,T1 !=
+ add D,T1,D
+ sll D,11,T2
+ srl D,32-11,D
+ or D,T2,D !=
+ add D,A,D
+
+ xor D,A,T1
+ sethi %hi(0x1fa27cf8),T2
+ xor T1,B,T1 !=
+ or T2,%lo(0x1fa27cf8),T2
+ add T1,RX,T1
+ !pre-LOADed X(2),R2
+ add T1,T2,T1
+ add C,T1,C !=
+ sll C,16,T2
+ srl C,32-16,C
+ or C,T2,C
+ add C,D,C !=
+
+ xor C,D,T1
+ sethi %hi(0xc4ac5665),T2
+ xor T1,A,T1
+ or T2,%lo(0xc4ac5665),T2 !=
+ add T1,R2,T1
+ !pre-LOADed X(0),R0
+ add T1,T2,T1
+ add B,T1,B
+ sll B,23,T2 !=
+ srl B,32-23,B
+ or B,T2,B
+ add B,C,B
+
+ !!!!!!!!Round 3
+
+ orn B,D,T1 !=
+ sethi %hi(0xf4292244),T2
+ xor T1,C,T1
+ or T2,%lo(0xf4292244),T2
+ add T1,R0,T1 !=
+ !pre-LOADed X(7),R7
+ add T1,T2,T1
+ add A,T1,A
+ sll A,6,T2
+ srl A,32-6,A !=
+ or A,T2,A
+ add A,B,A
+
+ orn A,C,T1
+ sethi %hi(0x432aff97),T2 !=
+ xor T1,B,T1
+ or T2,%lo(0x432aff97),T2
+ LOAD X(14),RX
+ add T1,R7,T1 !=
+ add T1,T2,T1
+ add D,T1,D
+ sll D,10,T2
+ srl D,32-10,D !=
+ or D,T2,D
+ add D,A,D
+
+ orn D,B,T1
+ sethi %hi(0xab9423a7),T2 !=
+ xor T1,A,T1
+ or T2,%lo(0xab9423a7),T2
+ add T1,RX,T1
+ !pre-LOADed X(5),R5
+ add T1,T2,T1 !=
+ add C,T1,C
+ sll C,15,T2
+ srl C,32-15,C
+ or C,T2,C !=
+ add C,D,C
+
+ orn C,A,T1
+ sethi %hi(0xfc93a039),T2
+ xor T1,D,T1 !=
+ or T2,%lo(0xfc93a039),T2
+ add T1,R5,T1
+ !pre-LOADed X(12),R12
+ add T1,T2,T1
+ add B,T1,B !=
+ sll B,21,T2
+ srl B,32-21,B
+ or B,T2,B
+ add B,C,B !=
+
+ orn B,D,T1
+ sethi %hi(0x655b59c3),T2
+ xor T1,C,T1
+ or T2,%lo(0x655b59c3),T2 !=
+ add T1,R12,T1
+ !pre-LOADed X(3),R3
+ add T1,T2,T1
+ add A,T1,A
+ sll A,6,T2 !=
+ srl A,32-6,A
+ or A,T2,A
+ add A,B,A
+
+ orn A,C,T1 !=
+ sethi %hi(0x8f0ccc92),T2
+ xor T1,B,T1
+ or T2,%lo(0x8f0ccc92),T2
+ add T1,R3,T1 !=
+ !pre-LOADed X(10),R10
+ add T1,T2,T1
+ add D,T1,D
+ sll D,10,T2
+ srl D,32-10,D !=
+ or D,T2,D
+ add D,A,D
+
+ orn D,B,T1
+ sethi %hi(0xffeff47d),T2 !=
+ xor T1,A,T1
+ or T2,%lo(0xffeff47d),T2
+ add T1,R10,T1
+ !pre-LOADed X(1),R1
+ add T1,T2,T1 !=
+ add C,T1,C
+ sll C,15,T2
+ srl C,32-15,C
+ or C,T2,C !=
+ add C,D,C
+
+ orn C,A,T1
+ sethi %hi(0x85845dd1),T2
+ xor T1,D,T1 !=
+ or T2,%lo(0x85845dd1),T2
+ add T1,R1,T1
+ !pre-LOADed X(8),R8
+ add T1,T2,T1
+ add B,T1,B !=
+ sll B,21,T2
+ srl B,32-21,B
+ or B,T2,B
+ add B,C,B !=
+
+ orn B,D,T1
+ sethi %hi(0x6fa87e4f),T2
+ xor T1,C,T1
+ or T2,%lo(0x6fa87e4f),T2 !=
+ add T1,R8,T1
+ LOAD X(15),RX
+ add T1,T2,T1
+ add A,T1,A !=
+ sll A,6,T2
+ srl A,32-6,A
+ or A,T2,A
+ add A,B,A !=
+
+ orn A,C,T1
+ sethi %hi(0xfe2ce6e0),T2
+ xor T1,B,T1
+ or T2,%lo(0xfe2ce6e0),T2 !=
+ add T1,RX,T1
+ !pre-LOADed X(6),R6
+ add T1,T2,T1
+ add D,T1,D
+ sll D,10,T2 !=
+ srl D,32-10,D
+ or D,T2,D
+ add D,A,D
+
+ orn D,B,T1 !=
+ sethi %hi(0xa3014314),T2
+ xor T1,A,T1
+ or T2,%lo(0xa3014314),T2
+ add T1,R6,T1 !=
+ !pre-LOADed X(13),R13
+ add T1,T2,T1
+ add C,T1,C
+ sll C,15,T2
+ srl C,32-15,C !=
+ or C,T2,C
+ add C,D,C
+
+ orn C,A,T1
+ sethi %hi(0x4e0811a1),T2 !=
+ xor T1,D,T1
+ or T2,%lo(0x4e0811a1),T2
+ !pre-LOADed X(4),R4
+ ld [Aptr],Aval
+ add T1,R13,T1 !=
+ add T1,T2,T1
+ add B,T1,B
+ sll B,21,T2
+ srl B,32-21,B !=
+ or B,T2,B
+ add B,C,B
+
+ orn B,D,T1
+ sethi %hi(0xf7537e82),T2 !=
+ xor T1,C,T1
+ or T2,%lo(0xf7537e82),T2
+ !pre-LOADed X(11),R11
+ ld [Dptr],Dval
+ add T1,R4,T1 !=
+ add T1,T2,T1
+ add A,T1,A
+ sll A,6,T2
+ srl A,32-6,A !=
+ or A,T2,A
+ add A,B,A
+
+ orn A,C,T1
+ sethi %hi(0xbd3af235),T2 !=
+ xor T1,B,T1
+ or T2,%lo(0xbd3af235),T2
+ !pre-LOADed X(2),R2
+ ld [Cptr],Cval
+ add T1,R11,T1 !=
+ add T1,T2,T1
+ add D,T1,D
+ sll D,10,T2
+ srl D,32-10,D !=
+ or D,T2,D
+ add D,A,D
+
+ orn D,B,T1
+ sethi %hi(0x2ad7d2bb),T2 !=
+ xor T1,A,T1
+ or T2,%lo(0x2ad7d2bb),T2
+ !pre-LOADed X(9),R9
+ ld [Bptr],Bval
+ add T1,R2,T1 !=
+ add Aval,A,Aval
+ add T1,T2,T1
+ st Aval,[Aptr]
+ add C,T1,C !=
+ sll C,15,T2
+ add Dval,D,Dval
+ srl C,32-15,C
+ or C,T2,C !=
+ st Dval,[Dptr]
+ add C,D,C
+
+ orn C,A,T1
+ sethi %hi(0xeb86d391),T2 !=
+ xor T1,D,T1
+ or T2,%lo(0xeb86d391),T2
+ add T1,R9,T1
+ !pre-LOADed X(0),R0
+ mov Aval,A !=
+ add T1,T2,T1
+ mov Dval,D
+ add B,T1,B
+ sll B,21,T2 !=
+ add Cval,C,Cval
+ srl B,32-21,B
+ st Cval,[Cptr]
+ or B,T2,B !=
+ add B,C,B
+
+ deccc %i2
+ mov Cval,C
+ add B,Bval,B !=
+ inc 64,%i1
+ nop
+ st B,[Bptr]
+ nop !=
+
+ #ifdef ULTRASPARC
+ bg,a,pt %icc,.Lmd5_block_loop
+ #else
+ bg,a .Lmd5_block_loop
+ #endif
+ LOAD X(0),R0
+
+ #ifdef ASI_PRIMARY_LITTLE
+ mov %o7,%asi
+ #endif
+ ret
+ restore %g0,0,%o0
+
+ .type md5_block,#function
+ .size md5_block,(.-md5_block)