Hello again,
as I promised, here is the optimized code for the SHA-256 hash on the x86
platform. It should run up to 20% faster on Core 2/iX. You are free to
use (or modify) this code in any form in OpenSSL and CRYPTOGAMS. I guess
you should make it PIC, like any other code for x86 (I didn't do that
because I don't need it in my projects).
Thanks again Andy!
--
SY / C4acT/\uBo Pavel Semjanov
_ _ _ http://www.semjanov.com
| | |-| |_|_| |-|
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# May 2012 Optimizations by Pavel Semjanov, <[email protected]>
# Should be faster up to 20% on modern CPUs
#
#
# SHA256 block transform for x86. September 2007.
#
# Performance in clock cycles per processed byte (less is better):
#
#               Pentium  PIII  P4    AMD K8  Core2
# gcc           46       36    41    27      26
# icc           57       33    38    25      23
# x86 asm       40       30    33    20      18
# x86_64 asm(*) -        -     21    16      16
#
# (*) x86_64 assembler performance is presented for reference
# purposes.
#
# Performance improvement over compiler generated code varies from
# 10% to 40% [see above]. Not very impressive on some micro-archs, but
# it's 5 times smaller and optimizes the amount of writes.
# Work out the directory this script lives in (everything up to and
# including the last "/" or "\" of $0) and add both that directory and
# its ../../perlasm sibling to the module search path.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
push(@INC, "${dir}", "${dir}../../perlasm");

# The instruction emitters used below (&mov, &xor, &DWP, &swtmp, ...) are
# not defined in this file; they are expected to come from x86asm.pl.
require "x86asm.pl";

# asm_init receives the first command-line argument, a fixed file-name
# label, and a flag that is true when the last command-line argument is
# the string "386".
&asm_init($ARGV[0], "sha512-586.pl", $ARGV[$#ARGV] eq "386");
# Placement of the eight SHA-256 working variables.  The goal is to keep
# as much state as possible in registers: a, b and c live in registers,
# d..h live in the stack frame.  The code should also work with a, b, c
# in memory, i.e.
#   $a=&DWP(64,"esp"); $b=&DWP(68,"esp"); $c=&DWP(72,"esp");
# but that configuration has not been tested.  (An alternative $d="edx"
# is likewise disabled.)
($a, $b, $c) = ("eax", "ebp", "ecx");
$d = &DWP(76, "esp");
$e = &DWP(80, "esp");
$f = &DWP(84, "esp");
$g = &DWP(88, "esp");
$h = &DWP(92, "esp");

# Scratch registers.  $_tmp1 and $_T1 hold the canonical assignment;
# BODY_00_15 swaps which of the two plays "tmp1" and which plays "T1"
# depending on the parity of the round number.
$_tmp1 = $tmp1 = "esi";
$_T1   = $T1   = "ebx";
$tmp2  = "edi";
$tmp3  = "edx";
$tmp4  = "ebx";

# Two stack slots used alternately by BODY_00_15: one round writes its
# a^b value to one slot while reading the previous round's value from
# the other.
$xor_sav1 = &DWP(104, "esp");
$xor_sav2 = &DWP(108, "esp");

$W   = "ebx";
$X   = "esp";
$ctx = "ebx";

# Rotate flavour used by my_rotl: 0 emits rotl, 1 emits shld.
# NOTE(review): the name suggests a Sandy Bridge tuning switch - confirm.
$sandy = 0;
# The 64 SHA-256 round constants K[0..63], indexed by round number.
# BODY_00_15 folds $DATA_K[$n] into each round as an immediate
# displacement of an lea instruction.
@DATA_K = (
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
);
# Generate the one routine this script produces.  sha1_block_host only
# reads its first argument (the symbol name); the trailing 0 is ignored.
&sha1_block_host("sha256_block_data_order", 0);
# asm_finish is not defined in this file (it is expected to come from
# x86asm.pl); presumably it completes/flushes the generated output.
&asm_finish();
# Translate round number $n into the four indices of the 16-entry
# circular schedule buffer that BODY_16_63 needs: the slot for $n itself
# and the slots 1, 14 and 9 positions further on, each reduced modulo 16.
# Returns the list (n, n+1, n+14, n+9), every element masked with 0x0f.
sub Na
{
    my ($round) = @_;

    my $slot_cur = $round & 0x0f;
    my $slot_p1  = ($round + 1) & 0x0f;
    my $slot_p14 = ($round + 14) & 0x0f;
    my $slot_p9  = ($round + 9) & 0x0f;

    return ($slot_cur, $slot_p1, $slot_p14, $slot_p9);
}
# Tell whether an operand string names a register.  The test is purely
# textual: an operand counts as a register when its first character is
# "e" (as in "eax", "ebp", ...).  Returns a true value in that case and
# a false value otherwise.
sub isreg
{
    my ($operand) = @_;
    my $first_char = substr($operand, 0, 1);
    return ($first_char eq "e");
}
# Emit a left rotate of operand $p1 by $p2 bits.  The global $sandy picks
# the instruction: 1 emits shld with $p1 as both source and destination,
# 0 emits a plain rotl.  Any other value of $sandy emits nothing.
sub my_rotl
{
    my ($operand, $bits) = @_;

    if ($sandy == 1) {
        &shld($operand, $operand, $bits);
    }
    elsif ($sandy == 0) {
        &rotl($operand, $bits);
    }
}
# Emit the instructions for one SHA-256 compression round.
#
# Fix over the previous revision: several trailing comments had been
# hard-wrapped, leaving their continuation text ("(ROTATE((e),26) ...",
# "(e,f,g)", "Maj(a,b,c) + T1", ...) on lines of their own where Perl
# parsed it as code.  Those fragments are comments again; the emitted
# instruction sequence itself is unchanged.
#
# Arguments:
#   $round00_15  1 for rounds 0..15 (the message word is loaded from the
#                input block, byte-swapped and stored to the schedule
#                buffer); 0 when called from BODY_16_63, which has already
#                left the schedule word in the T1 register.
#   $X           accepted for call compatibility; not used in the body.
#   $n           round number, 0..63; selects $DATA_K[$n] and the parity
#                dependent register roles.
#   $a..$h       operands (register names or memory operands) holding the
#                working variables for this round.  Callers rotate the
#                list by one position per round instead of moving data.
#                $c is never read directly: Maj is computed from b^c,
#                which the previous round stored as its a^b.
#
# Register roles: on even rounds tmp1/T1 are $_tmp1/$_T1, on odd rounds
# they are swapped.  When d is a memory operand, a round ends with
# T1 = d + T1, i.e. the next round's e, and that register is the next
# round's tmp1 - which is why e is only loaded explicitly for round 0 or
# when e is a register.
#
# The statement order below is the instruction schedule; do not reorder.
sub BODY_00_15
{
    local($round00_15,$X,$n,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
    local($tmp1, $T1);

    &comment("00_15 $n") if $round00_15 == 1;
    &comment("continue 16_63 $n") if $round00_15 == 0;

    if ($n % 2 == 0) { $tmp1 = $_tmp1; $T1 = $_T1;}
    else { $tmp1 = $_T1; $T1 = $_tmp1; }

    # Round 0 has no predecessor to leave the data pointer in T1.
    &mov ($T1, &wparam(1)) if $n == 0;

    # Sigma1(e) = (ROTATE((e),26) ^ ROTATE((e),21) ^ ROTATE((e),7))
    &mov ($tmp1, $e) if isreg ($e) || $n == 0;
    &mov ($tmp3,$g); # tmp3 to hold Ch(e,f,g)
    &mov ($tmp2, $tmp1);
    &my_rotl ($tmp1, 7); # e <<< 7
    &xor ($tmp3,$f);
    &and ($tmp3,$tmp2);
    if ($round00_15) {
        # T1 holds the data pointer here: fetch word $n, convert it with
        # bswap and remember it in schedule slot $n.
        &mov ($T1, &DWP($n*4, $T1, "", 1));
        &bswap ($T1);
        &mov (&swtmp($n), $T1); # X[i] = T1
    }
    &my_rotl ($tmp2, 21); # e <<< 21
    &add ($T1, $h); # T1 += h
    &xor ($tmp1, $tmp2);
    &my_rotl ($tmp2, 5); # e <<< 26
    &xor ($tmp2, $tmp1); # tmp2 = Sigma1(e)
    &xor ($tmp3,$g); # tmp3=Ch(e,f,g)
    &lea ($T1,&DWP($DATA_K[$n],$T1,$tmp3,1)); # T1 += K[i] + Ch(e,f,g)

    # Sigma0(a) = (ROTATE((a),30) ^ ROTATE((a),19) ^ ROTATE((a),10))
    &mov ($tmp1, $a);
    &add ($T1, $tmp2); # T1 += Sigma1(e)

    # The two xor slots swap roles every round: this round's a^b is
    # written to xor_save1, the previous round's a^b (= this round's b^c)
    # is read from xor_save2.
    if ($n % 2 == 0) { $xor_save1 = $xor_sav1; $xor_save2 = $xor_sav2; }
    else { $xor_save1 = $xor_sav2; $xor_save2 = $xor_sav1; }

    &mov ($tmp2, $tmp1);
    &my_rotl ($tmp1, 10); # a <<< 10
    &mov ($tmp3, $b); # tmp3 to hold Maj(a,b,c)
    &xor ($tmp3, $a);
    &my_rotl ($tmp2, 19); # a <<< 19
    &xor ($tmp2, $tmp1);
    &my_rotl ($tmp1, 20); # a <<< 30
    &xor ($tmp2, $tmp1); # tmp2 = Sigma0(a)
    if ($round00_15) {
        # Reload the data pointer; this tmp1 is the next round's T1.
        &mov ($tmp1, &wparam(1));
    }
    &mov ($xor_save1, $tmp3);
    &and ($tmp3,$xor_save2);
    &xor ($tmp3, $b); # tmp3 = Maj(a, b, c)
    &add ($tmp3, $T1); # tmp3 += T1
    if (isreg($h)) {
        &lea ($h,&DWP(0,$tmp2,$tmp3,1)); # h = Sigma0(a) + Maj(a,b,c) + T1
    }
    else {
        &add ($tmp3, $tmp2); # h = Sigma0(a) + Maj(a,b,c) + T1
        &mov ($h, $tmp3);
    }
    if (&isreg($d)) {
        &add ($d, $T1); # d += T1
    }
    else {
        &add ($T1, $d); # d += T1
        # After round 63 the sum is left in T1 only; sha1_block_host
        # picks it up from there.
        &mov ($d, $T1) if ($n < 63);
    }
}
# Emit one of rounds 16..63: compute the next message-schedule word and
# then run the common round body.
#
# Fix over the previous revision: the hard-wrapped comment tails
# ("(ROTATE((x),25) ...", "(ROTATE((x),15) ...", "sigma1") sat on lines
# of their own and were parsed as Perl code; they are comments again.
# The emitted instruction sequence is unchanged.
#
# Arguments:
#   $round, $K   accepted for call compatibility; not used in the body.
#   $n           round number, 16..63.
#   $a..$h       working-variable operands, forwarded to BODY_00_15.
#
# The new word is X[i] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]) with
# indices taken modulo 16 (see Na).  It is written back to slot X[i]
# only while $n <= 61 and is handed to BODY_00_15 in the T1 register.
#
# NOTE: $tmp1 and $T1 are deliberately assigned as globals here (they
# are not in the local() list).  After the last round sha1_block_host
# relies on the global $T1 naming the register that holds the new e.
sub BODY_16_63
{
    local($round,$K,$n,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    &comment("16_63 $n");

    local($n0,$n1,$n2,$n3)=&Na($n);

    # T1 alternates with round parity exactly as in BODY_00_15; the
    # scratch register used as tmp1 here is always $tmp2.
    if ($n % 2 == 0) { $tmp1 = $tmp2; $T1 = $_T1;}
    else { $tmp1 = $tmp2; $T1 = $_tmp1; }

    # sigma0(x) = (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
    &mov ($tmp3,&swtmp($n1)); # X[i+1]
    &mov ($tmp1,$tmp3); # X[i+1]
    &shr ($tmp3, 3);
    &my_rotl ($tmp1, 14);
    &xor ($tmp3, $tmp1);
    &my_rotl ($tmp1, 11); # 14 + 11 = 25
    &xor ($tmp3, $tmp1); # tmp3 = sigma0(X[i+1])
    &add ($tmp3,&swtmp($n0)); # X[i]
    &add ($tmp3,&swtmp($n3)); # X[i+9]

    # sigma1(x) = (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
    &mov ($tmp1,&swtmp($n2)); # X[i+14]
    &mov ($T1, $tmp1);
    &shr ($tmp1, 10);
    &my_rotl ($T1, 13);
    &xor ($tmp1, $T1);
    &my_rotl ($T1, 2); # 13 + 2 = 15
    &xor ($T1, $tmp1); # T1 = sigma1(X[i+14])
    &lea ($T1,&DWP(0,$T1,$tmp3,1)); # T1 = X[i] + sigma0 + sigma1
    # &add ($T1, $tmp3);
    &mov(&swtmp($n0), $T1) if $n <= 61;

    &BODY_00_15(0,$X, $n,$a,$b,$c,$d,$e,$f,$g,$h);
}
# Emit the complete block-transform routine under the symbol name $name.
#
# Fix over the previous revision: the tail of the stack_push comment
# ("a,b,c,d,e,f,g,h") had been wrapped onto its own line, where Perl
# parsed it as code; it is a comment again.  The stale "MD5_CTX" note was
# also corrected.  The emitted instruction sequence is unchanged.
#
# Arguments:
#   $name   symbol name of the generated function.  Any further argument
#           is ignored.
#
# As read from the code below, the generated function takes three
# parameters: wparam(0) the hash state (eight 32-bit words at offsets
# 0..28), wparam(1) the input data pointer, wparam(2) the number of
# 64-byte blocks.  The loop test sits at the bottom, so one block is
# processed before the end pointer is first compared.
#
# The statement order is the instruction schedule; do not reorder.
sub sha1_block_host
{
    local($name)=@_;

    &function_begin_B($name,"");

    # parameter 0 is the SHA-256 state:
    # A 0
    # B 4
    # C 8
    # D 12
    # E 16
    # (F, G, H follow at 20, 24, 28)
    $ctx = "ebp";

    &mov("ecx", &wparam(2)); #num
    &push("esi");
    &shl("ecx",6);           # blocks -> bytes
    &mov("esi", &wparam(1)); # data
    &push("ebp");
    &add("ecx","esi"); # offset to leave on
    &push("ebx");
    &mov("ebp", &wparam(0)); # SHA_context
    &push("edi");
    # Frame of 18+8+2 dwords.
    # Original note: X[16] + swtmp(17) + a,b,c,d,e,f,g,h
    &stack_push(18+8+2);

    # ecx is about to be reused for $c, so park the end pointer first.
    &mov(&swtmp(25),"ecx");

    &mov($tmp1, &DWP(0,$ctx,"",0));
    &mov($tmp2, &DWP(4,$ctx,"",0));
    &mov($tmp3, &DWP(8,$ctx,"",0));
    &mov($tmp4, &DWP(12,$ctx,"",0));
    &mov($a, $tmp1);
    # &mov($b, $tmp2);
    &mov($c, $tmp3);
    &mov($d, $tmp4);
    &mov($tmp1, &DWP(16,$ctx,"",0));
    &mov($tmp3, &DWP(20,$ctx,"",0));
    &mov($e, $tmp1);
    &mov($f, $tmp3);
    &mov($tmp1, &DWP(24,$ctx,"",0));
    &mov($tmp3, &DWP(28,$ctx,"",0));
    &mov($g, $tmp1);
    &mov($h, $tmp3);
    # $b and $ctx are both ebp, so b is loaded last, once the state
    # pointer is no longer needed.
    &mov($b, $tmp2);

    &set_label("start");
    &comment("");
    &comment("Start processing");

    # odd start: seed the slot round 0 reads b^c from.
    &mov ($tmp1, $b);
    &xor ($tmp1, $c);
    &mov ($xor_sav2, $tmp1);

    # Rounds 0..15.  The variable list is rotated by one position per
    # round, so no data moves between the working variables.
    &BODY_00_15(1,$X, 0,$a,$b,$c,$d,$e,$f,$g,$h);
    &BODY_00_15(1,$X, 1,$h,$a,$b,$c,$d,$e,$f,$g);
    &BODY_00_15(1,$X, 2,$g,$h,$a,$b,$c,$d,$e,$f);
    &BODY_00_15(1,$X, 3,$f,$g,$h,$a,$b,$c,$d,$e);
    &BODY_00_15(1,$X, 4,$e,$f,$g,$h,$a,$b,$c,$d);
    &BODY_00_15(1,$X, 5,$d,$e,$f,$g,$h,$a,$b,$c);
    &BODY_00_15(1,$X, 6,$c,$d,$e,$f,$g,$h,$a,$b);
    &BODY_00_15(1,$X, 7,$b,$c,$d,$e,$f,$g,$h,$a);
    &BODY_00_15(1,$X, 8,$a,$b,$c,$d,$e,$f,$g,$h);
    &BODY_00_15(1,$X, 9,$h,$a,$b,$c,$d,$e,$f,$g);
    &BODY_00_15(1,$X,10,$g,$h,$a,$b,$c,$d,$e,$f);
    &BODY_00_15(1,$X,11,$f,$g,$h,$a,$b,$c,$d,$e);
    &BODY_00_15(1,$X,12,$e,$f,$g,$h,$a,$b,$c,$d);
    &BODY_00_15(1,$X,13,$d,$e,$f,$g,$h,$a,$b,$c);
    &BODY_00_15(1,$X,14,$c,$d,$e,$f,$g,$h,$a,$b);
    &BODY_00_15(1,$X,15,$b,$c,$d,$e,$f,$g,$h,$a);

    # Rounds 16..63, eight per pass so the rotation returns to its
    # starting position.
    for ($i=16;$i<64;$i+=8)
    {
        &BODY_16_63(1,$X,$i+0,$a,$b,$c,$d,$e,$f,$g,$h);
        &BODY_16_63(1,$X,$i+1,$h,$a,$b,$c,$d,$e,$f,$g);
        &BODY_16_63(1,$X,$i+2,$g,$h,$a,$b,$c,$d,$e,$f);
        &BODY_16_63(1,$X,$i+3,$f,$g,$h,$a,$b,$c,$d,$e);
        &BODY_16_63(1,$X,$i+4,$e,$f,$g,$h,$a,$b,$c,$d);
        &BODY_16_63(1,$X,$i+5,$d,$e,$f,$g,$h,$a,$b,$c);
        &BODY_16_63(1,$X,$i+6,$c,$d,$e,$f,$g,$h,$a,$b);
        &BODY_16_63(1,$X,$i+7,$b,$c,$d,$e,$f,$g,$h,$a);
    }

    &comment("End processing");
    &comment("");

    # $e is now in the $T1: round 63 skips the store of d + T1, and the
    # global $T1 still names the register BODY_16_63 chose for it.
    $ctx = "edi";
    $tmp1="esi";
    &mov ($ctx,&wparam(0));

    # e first, while it is still live in $T1.
    &add ($T1, &DWP(16,$ctx,"",0));
    &mov (&DWP(16,$ctx,"",0), $T1);
    &mov ($e, $T1);

    # a..d: memory-resident variables go through a scratch register,
    # register-resident ones are added and stored directly.
    &mov ($tmp1, &DWP(0,$ctx,"",0)) if !isreg ($a);
    &mov ($tmp3, &DWP(4,$ctx,"",0)) if !isreg ($b);
    &add ($tmp1, $a) if !isreg ($a);
    &add ($tmp3, $b) if !isreg ($b);
    &mov ($a, $tmp1) if !isreg ($a);
    &mov ($b, $tmp3) if !isreg ($b);
    &mov (&DWP(0,$ctx,"",0), $tmp1) if !isreg ($a);
    &mov (&DWP(4,$ctx,"",0), $tmp3) if !isreg ($b);
    &mov ($tmp1, &DWP(8,$ctx,"",0)) if !isreg ($c);
    &mov ($tmp3, &DWP(12,$ctx,"",0)) if !isreg ($d);
    &add ($tmp1, $c) if !isreg ($c);
    &add ($tmp3, $d) if !isreg ($d);
    &mov ($c, $tmp1) if !isreg ($c);
    &mov ($d, $tmp3) if !isreg ($d);
    &mov (&DWP(8,$ctx,"",0), $tmp1) if !isreg ($c);
    &mov (&DWP(12,$ctx,"",0), $tmp3) if !isreg ($d);
    &add ($a, &DWP(0,$ctx,"",0)) if isreg ($a);
    &add ($b, &DWP(4,$ctx,"",0)) if isreg ($b);
    &add ($c, &DWP(8,$ctx,"",0)) if isreg ($c);
    &add ($d, &DWP(12,$ctx,"",0)) if isreg ($d);
    &mov (&DWP(0,$ctx,"",0), $a) if isreg ($a);
    &mov (&DWP(4,$ctx,"",0), $b) if isreg ($b);
    &mov (&DWP(8,$ctx,"",0), $c) if isreg ($c);
    &mov (&DWP(12,$ctx,"",0), $d) if isreg ($d);

    # f..h, same scheme; e was already handled above, hence the
    # disabled lines.
    # &mov ($tmp1, &DWP(16,$ctx,"",0)) if !isreg ($e);
    &mov ($tmp3, &DWP(20,$ctx,"",0)) if !isreg ($f);
    # &add ($tmp1, $e) if !isreg ($e);
    &add ($tmp3, $f) if !isreg ($f);
    # &mov ($e, $tmp1) if !isreg ($e);
    &mov ($f, $tmp3) if !isreg ($f);
    # &mov (&DWP(16,$ctx,"",0), $tmp1) if !isreg ($e);
    &mov (&DWP(20,$ctx,"",0), $tmp3) if !isreg ($f);
    &mov ($tmp1, &DWP(24,$ctx,"",0)) if !isreg ($g);
    &mov ($tmp3, &DWP(28,$ctx,"",0)) if !isreg ($h);
    &add ($tmp1, $g) if !isreg ($g);
    &add ($tmp3, $h) if !isreg ($h);
    &mov ($g, $tmp1) if !isreg ($g);
    &mov ($h, $tmp3) if !isreg ($h);
    &mov (&DWP(24,$ctx,"",0), $tmp1) if !isreg ($g);
    &mov (&DWP(28,$ctx,"",0), $tmp3) if !isreg ($h);
    # &add ($e, &DWP(16,$ctx,"",0)) if isreg ($e);
    &add ($f, &DWP(20,$ctx,"",0)) if isreg ($f);
    &add ($g, &DWP(24,$ctx,"",0)) if isreg ($g);
    &add ($h, &DWP(28,$ctx,"",0)) if isreg ($h);
    # &mov (&DWP(16,$ctx,"",0), $e) if isreg ($e);
    &mov (&DWP(20,$ctx,"",0), $f) if isreg ($f);
    &mov (&DWP(24,$ctx,"",0), $g) if isreg ($g);
    &mov (&DWP(28,$ctx,"",0), $h) if isreg ($h);

    # Advance the data pointer (kept in the wparam(1) slot) by one block
    # and loop while it is below the saved end pointer.
    &mov($W,&wparam(1));
    &mov($tmp3,&swtmp(25));
    &add($W,64);
    &mov(&wparam(1),$W); # loop counter save
    &cmp($W,$tmp3);
    &jb(&label("start"));

    &stack_pop(18+8+2);
    &pop("edi");
    &pop("ebx");
    &pop("ebp");
    &pop("esi");
    &ret();

    # keep a note of shortcut label so it can be used outside
    # block.  (The value is not used within this sub.)
    my $sclabel = &label("shortcut");
    &function_end_B($name);
    # Putting this here avoids problems with MASM in debugging mode
    # &sha1_block_host("sha1_block_asm_host_order", $sclabel);
}