Now I agree ;) 1.8 version is "best-balanced" for all architectures.

Sorry, I can't give up ;) I also placed a^b and b^c in registers, which helped me eliminate two mov's from every round, and made some other code transformations. The new version gives up to a 3-5% speed-up on:
Core 2 - 950 cycles,
Lynnfield - 1005,
Sandy Bridge - 936 or 14.6 cpb (!)
P4 Northwood - 1600

On other architectures, including PIII, P4 Prescott, K10 and (I guess, not tested) Atom, the 1.8 version is better.

--

   SY / C4acT/\uBo             Pavel Semjanov
   _   _         _        http://www.semjanov.com
  | | |-| |_|_| |-|
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# May 2012 Optimizations by Pavel Semjanov, <[email protected]>
# Should be faster up to 20% on modern CPUs
#
#
# SHA256 block transform for x86. September 2007.
#
# Performance in clock cycles per processed byte (less is better):
#
#               Pentium PIII    P4      AMD K8  Core2
# gcc           46      36      41      27      26
# icc           57      33      38      25      23      
# x86 asm       40      30      33      20      18
# x86_64 asm(*) -       -       21      16      16
#
# (*) x86_64 assembler performance is presented for reference
#     purposes.
#
# Performance improvement over compiler generated code varies from
# 10% to 40% [see above]. Not very impressive on some µ-archs, but
# it's 5 times smaller and optimizes amount of writes.

# Work out the directory part of the path this script was invoked with, so
# that x86asm.pl can be found either next to the script or in ../../perlasm
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
push(@INC, "${dir}", "${dir}../../perlasm");
require "x86asm.pl";

# First command-line argument is handed to asm_init unchanged; the third
# asm_init argument is true when the last command-line argument is "386".
# NOTE(review): the name passed here says "sha512" although this script
# generates sha256_block_data_order -- confirm whether that is intentional.
&asm_init($ARGV[0], "sha512-586.pl", $ARGV[$#ARGV] eq "386");

# Operand allocation.  The aim is to keep as much data as possible in
# registers.  The code should also work with $a, $b and $c held in memory,
# but that configuration has not been tested.

# Working variables a..h.  All of them except $b are stack slots at
# esp+64 .. esp+92; $b is kept in ebp.
$a = &DWP(64, "esp");
$c = &DWP(72, "esp");
$d = &DWP(76, "esp");
$e = &DWP(80, "esp");
$f = &DWP(84, "esp");
$g = &DWP(88, "esp");
$h = &DWP(92, "esp");
$b = "ebp";

# Alternative placements, currently disabled:
#$b=&DWP(68,"esp");
#$a="eax";
#$c="ecx";
#$d="edx";

# Scratch registers.  $_tmp1 / $_T1 are the fixed pair whose roles
# BODY_00_15 swaps on every round; $tmp1 / $T1 are the "current" names.
$_tmp1 = "esi";
$_T1   = "ebx";
$tmp1  = "esi";
$T1    = "ebx";
$tmp2  = "edi";
$tmp3  = "edx";
$tmp4  = "ebx";

# Registers carrying the a^b / b^c values from one round to the next.
# Stack-based alternative, currently disabled:
#$xor_sav1=&DWP(104,"esp");
#$xor_sav2=&DWP(108,"esp");
$xor_sav1 = "eax";
$xor_sav2 = "ecx";

$W   = "ebx";
$X   = "esp";
$ctx = "ebx";

# Selects the rotate instruction emitted by my_rotl: 1 -> shld, 0 -> rotl.
$sandy = 0;

# The 64 per-round additive constants, indexed by round number.  BODY_00_15
# folds $DATA_K[$n] into the round as an immediate displacement.
@DATA_K = (
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
);

# Emit the block function under the symbol name "sha256_block_data_order".
# sha1_block_host only reads its first argument; the trailing 0 is unused.
&sha1_block_host("sha256_block_data_order", 0);

# asm_finish is not defined in this file (presumably provided by x86asm.pl
# and responsible for writing out the generated code -- confirm there).
&asm_finish();

# Map round number $n onto slots of the 16-entry circular word buffer.
#
# Returns a four-element list of slot indices, each reduced modulo 16:
#   ($n, $n+1, $n+14, $n+9)
# BODY_16_63 uses them as the X[i], X[i+1], X[i+14] and X[i+9] slots.
#
# $n is a lexical here so that calling Na never touches the package
# variable $n that the round generators keep live.
sub Na
        {
        my ($n) = @_;

        return( (($n     ) & 0x0f),
                (($n +  1) & 0x0f),
                (($n + 14) & 0x0f),
                (($n +  9) & 0x0f));
        }

# Tell whether operand string $r names a register rather than a memory
# operand.  The test is purely textual: the operand counts as a register
# when its first character is "e" (as in "eax", "ebp", ...).
#
# Returns a true value for a register operand, a false value otherwise.
sub isreg
        {
        my ($r) = @_;

        return (substr($r, 0, 1) eq "e");
        }


# Emit a left rotation of register $p1 by $p2 bits.
#
# The instruction used depends on the global $sandy:
#   $sandy == 1  ->  shld $p1,$p1,$p2  (double shift with the same register
#                    as both operands, i.e. a rotate left)
#   $sandy == 0  ->  rotl $p1,$p2
# Any other value of $sandy emits nothing.
sub my_rotl
        {
        my ($p1, $p2) = @_;

        if ($sandy == 1)
                {
                &shld($p1, $p1, $p2);
                }
        elsif ($sandy == 0)
                {
                &rotl($p1, $p2);
                }
        }


# Emit the instructions for one compression round.
#
# Arguments:
#   $round00_15  1 for rounds 0..15: the message word is loaded from the
#                input buffer, byte-swapped and saved in swtmp($n).
#                0 when called from BODY_16_63: $T1 already holds the
#                schedule word for this round.
#   $X           not referenced in this sub.
#   $n           round number; selects $DATA_K[$n] and the register roles.
#   $a .. $h     operands (register name or memory operand string) that
#                play the roles a..h in this round.
#
# Register roles alternate with the parity of $n: the $_tmp1/$_T1 pair and
# the $xor_sav1/$xor_sav2 pair swap on every round, so values left in
# registers by round n are picked up by round n+1 without extra moves.
#
# The argument and scratch names are deliberately bound with local(): this
# file relies on package variables throughout, and the instruction order
# below is kept exactly as written.
sub BODY_00_15
        {
        local($round00_15,$X,$n,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
        local($tmp1, $T1);

        &comment("00_15 $n") if $round00_15 == 1;
        &comment("continue 16_63 $n") if $round00_15 == 0;

        if ($n % 2 == 0) { $tmp1 = $_tmp1; $T1 = $_T1;}
        else { $tmp1 = $_T1; $T1 = $_tmp1; }

        # Very first round: fetch the input data pointer.
        &mov ($T1, &wparam(1)) if $n == 0;

        # Sigma1(e) = (ROTATE((e),26) ^ ROTATE((e),21) ^ ROTATE((e),7))
        # When $e is in memory and $n != 0, $tmp1 is not reloaded here.
        &mov   ($tmp1, $e) if isreg ($e) || $n == 0;
        &mov   ($tmp3,$g);                              # tmp3 to hold Ch(e,f,g)
        &mov   ($tmp2, $tmp1);
        &my_rotl  ($tmp1, 7);                           # e << 7
        &xor  ($tmp3,$f);
        &and  ($tmp3,$tmp2);

        if ($round00_15) {
        &mov ($T1, &DWP($n*4, $T1, "", 1));             # load input word $n
        &bswap ($T1);
        &mov   (&swtmp($n), $T1);                       # X[i] = T1
        }

        &my_rotl  ($tmp2, 21);                          # e << 21
        &add  ($T1, $h);                                # T1 += h
        &xor   ($tmp1, $tmp2);
        &my_rotl  ($tmp2, 5);                           # e << 26
        &xor   ($tmp2, $tmp1);                          # tmp2 = Sigma1(e)

        if ($n % 2 == 0) { $xor_save1 = $xor_sav1; $xor_save2 = $xor_sav2; }
        else             { $xor_save1 = $xor_sav2; $xor_save2 = $xor_sav1; }


        &xor  ($tmp3,$g);                               # tmp3=Ch(e,f,g)
        # xor_save1 should already have $a, see below
        &mov  ($xor_save1, $a) if isreg($a) || $n == 0;
        # Sigma0(a) = (ROTATE((a),30) ^ ROTATE((a),19) ^ ROTATE((a),10))
        &mov  ($tmp1, $xor_save1);
        &lea  ($tmp3,&DWP(0,$T1,$tmp3,1));              # tmp3 = T1 + Ch(e,f,g)
        &lea  ($T1,&DWP($DATA_K[$n], $tmp2, $tmp3, 0)); # T1 += Sigma1(e)

        &mov   ($tmp2, $tmp1);
        &my_rotl  ($tmp1, 10);                          # a << 10
        # a^b, save as b^c for next round, xor_save1 will be xor_save2
        &xor   ($xor_save1, $b);
        &my_rotl  ($tmp2, 19);                          # a << 19
        # calculate Maj using already calculated b^c
        &and   ($xor_save2, $xor_save1);
        &xor   ($tmp2, $tmp1);
        &my_rotl  ($tmp1, 20);                          # a << 30
        &xor   ($xor_save2, $b);                        # xor_save2 = Maj(a, b, c)
        &xor   ($tmp2, $tmp1);                          # tmp2 = Sigma0(a)

        if ($round00_15) {
        # Reload the data pointer into the register that is $T1 next round.
        &mov ($tmp1, &wparam(1));
        }

        &add   ($xor_save2, $T1);                       # xor_save2 = Maj(a,b,c) + T1

        if (isreg($h)) {
        # h = Sigma0(a) + Maj(a,b,c) + T1
        &lea  ($h,&DWP(0,$tmp2,$xor_save2,1));
        }
        else {
        # h = Sigma0(a) + Maj(a,b,c) + T1
        &add   ($xor_save2, $tmp2);
        # will be $a in the next round, xor_save2 will be xor_save1
        &mov   ($h, $xor_save2);
        }

        if (&isreg($d)) {
          &add ($d, $T1);                               # d += T1
        }
        else {
          &add ($T1, $d);                               # d += T1
          # will be $e in the next round; after round 63 the value is
          # left in $T1 only
          &mov ($d, $T1) if ($n < 63);
        }

        }


# Emit one round for $n in 16..63: first compute this round's word from the
# 16-entry circular buffer in swtmp(0..15), then hand over to BODY_00_15
# for the compression step.
#
# Arguments:
#   $round, $K   not referenced in this sub.
#   $n           round number.
#   $a .. $h     operands playing the roles a..h in this round; passed on
#                unchanged to BODY_00_15.
#
# Note that $tmp1 and $T1 are assigned here without being localized: the
# scratch register is always $tmp2's register, and $T1 alternates between
# $_T1 and $_tmp1 with the parity of $n, matching BODY_00_15.
sub BODY_16_63
        {
        local($round,$K,$n,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

        &comment("16_63 $n");

        local($n0,$n1,$n2,$n3)=&Na($n);

        if ($n % 2 == 0) { $tmp1 = $tmp2; $T1 = $_T1;}
        else { $tmp1 = $tmp2; $T1 = $_tmp1; }

        # sigma0(x) = (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
        &mov   ($tmp3,&swtmp($n1));                     # X[i+1]
        &mov   ($tmp1,$tmp3);                           # X[i+1]
        &shr   ($tmp3, 3);
        &my_rotl  ($tmp1, 14);
        &xor   ($tmp3, $tmp1);
        &my_rotl  ($tmp1, 11);                          # 14 + 11 = 25
        &xor   ($tmp3, $tmp1);                          # tmp3 = sigma0(X[i+1])

        &add ($tmp3,&swtmp($n0));                       # X[i]
        &add ($tmp3,&swtmp($n3));                       # X[i+9]

        # sigma1(x) = (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
        &mov   ($tmp1,&swtmp($n2));                     # X[i+14]
        &mov   ($T1, $tmp1);
        &shr   ($tmp1, 10);
        &my_rotl  ($T1, 13);
        &xor   ($tmp1, $T1);
        &my_rotl  ($T1, 2);                             # 13 + 2 = 15
        &xor   ($T1, $tmp1);                            # T1 = sigma1(X[i+14])


        # T1 = X[i] + sigma0 + sigma1
        &lea  ($T1,&DWP(0,$T1,$tmp3,1));
        # The words of rounds 62 and 63 are not written back to the buffer.
        &mov(&swtmp($n0), $T1)  if $n <= 61;

        &BODY_00_15(0,$X, $n,$a,$b,$c,$d,$e,$f,$g,$h);
}




# Emit the complete block function under the symbol name $name.
#
# Only the first argument ($name) is read; anything else passed by the
# caller is ignored.
#
# Generated function, as laid out below:
#   - wparam(0) is the context pointer, wparam(1) the data pointer and
#     wparam(2) the block count ("num"); num*64 + data is stored in
#     swtmp(25) as the end-of-input pointer.
#   - The eight 32-bit context words at offsets 0..28 are copied into the
#     $a..$h operands.
#   - "start" loop: 16 rounds via BODY_00_15, 48 via BODY_16_63, then the
#     working variables are added back into the context, the data pointer
#     in wparam(1) is advanced by 64 and compared against swtmp(25).
sub sha1_block_host
        {
        local($name)=@_;

        &function_begin_B($name,"");

        # parameter 0 is the context structure; the code below reads and
        # writes eight 32-bit words in it:
        # A     0
        # B     4
        # C     8
        # D     12
        # E     16
        # F     20
        # G     24
        # H     28

$ctx = "ebp";

        &mov("ecx",     &wparam(2));  #num
         &push("esi");
        &shl("ecx",6);
         &mov("esi",    &wparam(1)); # data
        &push("ebp");
         &add("ecx","esi");     # offset to leave on
        &push("ebx");
         &mov("ebp",    &wparam(0));  # SHA_context
        &push("edi");
        # X[16] + swtmp(17) + a,b,c,d,e,f,g,h
        &stack_push(18+8+2);
         &mov(&swtmp(25),"ecx");


         &mov($tmp1,    &DWP(0,$ctx,"",0));
         &mov($tmp2,    &DWP(4,$ctx,"",0));
         &mov($tmp3,    &DWP(8,$ctx,"",0));
         &mov($tmp4,    &DWP(12,$ctx,"",0));
         &mov($a,       $tmp1);
#        &mov($b,       $tmp2);
         &mov($c,       $tmp3);
         &mov($d,       $tmp4);
         &mov($tmp1,    &DWP(16,$ctx,"",0));
         &mov($tmp3,    &DWP(20,$ctx,"",0));
         &mov($e,       $tmp1);
         &mov($f,       $tmp3);
         &mov($tmp1,    &DWP(24,$ctx,"",0));
         &mov($tmp3,    &DWP(28,$ctx,"",0));
         &mov($g,       $tmp1);
         &mov($h,       $tmp3);

         # $b shares its register (ebp) with $ctx, so it is loaded only
         # after the last read through $ctx.
         &mov($b,       $tmp2);

        &set_label("start");

        &comment("");
        &comment("Start processing");


        # odd start: seed $xor_sav2 with b^c for round 0
        &mov ($tmp1, $b);
        &xor ($tmp1, $c);
        &mov ($xor_sav2, $tmp1);
        &BODY_00_15(1,$X, 0,$a,$b,$c,$d,$e,$f,$g,$h);
        &BODY_00_15(1,$X, 1,$h,$a,$b,$c,$d,$e,$f,$g);
        &BODY_00_15(1,$X, 2,$g,$h,$a,$b,$c,$d,$e,$f);
        &BODY_00_15(1,$X, 3,$f,$g,$h,$a,$b,$c,$d,$e);
        &BODY_00_15(1,$X, 4,$e,$f,$g,$h,$a,$b,$c,$d);
        &BODY_00_15(1,$X, 5,$d,$e,$f,$g,$h,$a,$b,$c);
        &BODY_00_15(1,$X, 6,$c,$d,$e,$f,$g,$h,$a,$b);
        &BODY_00_15(1,$X, 7,$b,$c,$d,$e,$f,$g,$h,$a);
        &BODY_00_15(1,$X, 8,$a,$b,$c,$d,$e,$f,$g,$h);
        &BODY_00_15(1,$X, 9,$h,$a,$b,$c,$d,$e,$f,$g);
        &BODY_00_15(1,$X,10,$g,$h,$a,$b,$c,$d,$e,$f);
        &BODY_00_15(1,$X,11,$f,$g,$h,$a,$b,$c,$d,$e);
        &BODY_00_15(1,$X,12,$e,$f,$g,$h,$a,$b,$c,$d);
        &BODY_00_15(1,$X,13,$d,$e,$f,$g,$h,$a,$b,$c);
        &BODY_00_15(1,$X,14,$c,$d,$e,$f,$g,$h,$a,$b);
        &BODY_00_15(1,$X,15,$b,$c,$d,$e,$f,$g,$h,$a);


        for ($i=16;$i<64;$i+=8)
                {
        &BODY_16_63(1,$X,$i+0,$a,$b,$c,$d,$e,$f,$g,$h);
        &BODY_16_63(1,$X,$i+1,$h,$a,$b,$c,$d,$e,$f,$g);
        &BODY_16_63(1,$X,$i+2,$g,$h,$a,$b,$c,$d,$e,$f);
        &BODY_16_63(1,$X,$i+3,$f,$g,$h,$a,$b,$c,$d,$e);
        &BODY_16_63(1,$X,$i+4,$e,$f,$g,$h,$a,$b,$c,$d);
        &BODY_16_63(1,$X,$i+5,$d,$e,$f,$g,$h,$a,$b,$c);
        &BODY_16_63(1,$X,$i+6,$c,$d,$e,$f,$g,$h,$a,$b);
        &BODY_16_63(1,$X,$i+7,$b,$c,$d,$e,$f,$g,$h,$a);
                }

        &comment("End processing");
        &comment("");

# $e is now in the $T1 (round 63 skips the store back to memory)

$ctx = "edi";
$tmp1="esi";

         &mov ($ctx,&wparam(0));
         &add ($T1,     &DWP(16,$ctx,"",0));
         &mov (&DWP(16,$ctx,"",0), $T1);
         &mov ($e, $T1);


         # a..d held in memory: add context word, store to both places.
         &mov ($tmp1,   &DWP(0,$ctx,"",0))      if !isreg ($a);
         &mov ($tmp3,   &DWP(4,$ctx,"",0))      if !isreg ($b);
         &add ($tmp1, $a)                       if !isreg ($a);
         &add ($tmp3, $b)                       if !isreg ($b);
         &mov ($a, $tmp1)                     if !isreg ($a);
         &mov ($b, $tmp3)                     if !isreg ($b);
         &mov (&DWP(0,$ctx,"",0), $tmp1)    if !isreg ($a);
         &mov (&DWP(4,$ctx,"",0), $tmp3)    if !isreg ($b);
         &mov ($tmp1,   &DWP(8,$ctx,"",0))      if !isreg ($c);
         &mov ($tmp3,   &DWP(12,$ctx,"",0))     if !isreg ($d);
         &add ($tmp1, $c)                       if !isreg ($c);
         &add ($tmp3, $d)                       if !isreg ($d);
         &mov ($c, $tmp1)                     if !isreg ($c);
         &mov ($d, $tmp3)                     if !isreg ($d);
         &mov (&DWP(8,$ctx,"",0), $tmp1)    if !isreg ($c);
         &mov (&DWP(12,$ctx,"",0), $tmp3)    if !isreg ($d);

         # a..d held in registers: add context word in place, then store.
         &add ($a,      &DWP(0,$ctx,"",0))      if isreg ($a);
         &add ($b,      &DWP(4,$ctx,"",0))      if isreg ($b);
         &add ($c,      &DWP(8,$ctx,"",0))      if isreg ($c);
         &add ($d,      &DWP(12,$ctx,"",0))     if isreg ($d);

         &mov (&DWP(0,$ctx,"",0), $a)           if isreg ($a);
         &mov (&DWP(4,$ctx,"",0), $b)       if isreg ($b);
         &mov (&DWP(8,$ctx,"",0), $c)       if isreg ($c);
         &mov (&DWP(12,$ctx,"",0), $d)      if isreg ($d);


         # f..h held in memory ($e was already handled via $T1 above).
#        &mov ($tmp1,   &DWP(16,$ctx,"",0))     if !isreg ($e);
         &mov ($tmp3,   &DWP(20,$ctx,"",0))     if !isreg ($f);
#        &add ($tmp1, $e)                       if !isreg ($e);
         &add ($tmp3, $f)                       if !isreg ($f);
#        &mov ($e, $tmp1)                     if !isreg ($e);
         &mov ($f, $tmp3)                     if !isreg ($f);
#        &mov (&DWP(16,$ctx,"",0), $tmp1)    if !isreg ($e);
         &mov (&DWP(20,$ctx,"",0), $tmp3)    if !isreg ($f);
         &mov ($tmp1,   &DWP(24,$ctx,"",0))     if !isreg ($g);
         &mov ($tmp3,   &DWP(28,$ctx,"",0))     if !isreg ($h);
         &add ($tmp1, $g)                       if !isreg ($g);
         &add ($tmp3, $h)                       if !isreg ($h);
         &mov ($g, $tmp1)                     if !isreg ($g);
         &mov ($h, $tmp3)                     if !isreg ($h);
         &mov (&DWP(24,$ctx,"",0), $tmp1)    if !isreg ($g);
         &mov (&DWP(28,$ctx,"",0), $tmp3)    if !isreg ($h);

         # f..h held in registers.
#        &add ($e,      &DWP(16,$ctx,"",0))     if isreg ($e);
         &add ($f,      &DWP(20,$ctx,"",0))     if isreg ($f);
         &add ($g,      &DWP(24,$ctx,"",0))     if isreg ($g);
         &add ($h,      &DWP(28,$ctx,"",0))     if isreg ($h);

#        &mov (&DWP(16,$ctx,"",0), $e)  if isreg ($e);
         &mov (&DWP(20,$ctx,"",0), $f)       if isreg ($f);
         &mov (&DWP(24,$ctx,"",0), $g)       if isreg ($g);
         &mov (&DWP(28,$ctx,"",0), $h)      if isreg ($h);


        # Advance the data pointer by one 64-byte block and loop while it
        # is below the end pointer saved in swtmp(25).
        &mov($W,&wparam(1));
         &mov($tmp3,&swtmp(25));

        &add($W,64);
        &mov(&wparam(1),$W);            # loop counter save
         &cmp($W,$tmp3);
         &jb(&label("start"));

        &stack_pop(18+8+2);
         &pop("edi");
        &pop("ebx");
         &pop("ebp");
        &pop("esi");
         &ret();

        # keep a note of shortcut label so it can be used outside
        # block.
        my $sclabel = &label("shortcut");

        &function_end_B($name);
        # Putting this here avoids problems with MASM in debugging mode
#       &sha1_block_host("sha1_block_asm_host_order", $sclabel);
        }

Reply via email to