Module Name: src
Committed By: christos
Date: Mon Mar 21 19:12:26 UTC 2016
Modified Files:
src/crypto/external/bsd/openssl/dist/crypto: x86_64cpuid.pl
src/crypto/external/bsd/openssl/dist/crypto/sha/asm: sha1-x86_64.pl
Log Message:
revert change from openssl-1.1.0-pre4; it breaks gcc-4.8
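
For context: the reverted pre4 change had widened the OPENSSL_ia32cap_P
capability vector from 8 to 16 bytes (.comm OPENSSL_ia32cap_P,16,4), filled
the third word from CPUID leaf 7 %ebx, and keyed new SHA-extension and
AVX2+BMI code paths in sha1-x86_64.pl off those bits; this commit restores
the two-word vector and the plain x86_64/SSSE3/AVX paths. As a rough,
non-authoritative sketch, the restored $avx gating in sha1-x86_64.pl probes
the assembler through the C driver (assuming $ENV{CC} points at a working
compiler driver) roughly like so:

    # sketch only: require GNU as >= 2.19 before emitting the AVX path
    my $avx = 0;
    $avx = 1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
                 $1 >= 2.19);
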
To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 \
src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl
cvs rdiff -u -r1.4 -r1.5 \
src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl
diff -u src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl:1.2 src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl:1.3
--- src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl:1.2 Sun Mar 20 18:26:56 2016
+++ src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl Mon Mar 21 15:12:26 2016
@@ -24,7 +24,7 @@ print<<___;
call OPENSSL_cpuid_setup
.hidden OPENSSL_ia32cap_P
-.comm OPENSSL_ia32cap_P,16,4
+.comm OPENSSL_ia32cap_P,8,4
.text
@@ -53,13 +53,12 @@ OPENSSL_rdtsc:
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
.globl OPENSSL_ia32_cpuid
-.type OPENSSL_ia32_cpuid,\@function,1
+.type OPENSSL_ia32_cpuid,\@abi-omnipotent
.align 16
OPENSSL_ia32_cpuid:
mov %rbx,%r8 # save %rbx
xor %eax,%eax
- mov %eax,8(%rdi) # clear 3rd word
cpuid
mov %eax,%r11d # max value for standard query level
@@ -127,14 +126,6 @@ OPENSSL_ia32_cpuid:
shr \$14,%r10d
and \$0xfff,%r10d # number of cores -1 per L1D
- cmp \$7,%r11d
- jb .Lnocacheinfo
-
- mov \$7,%eax
- xor %ecx,%ecx
- cpuid
- mov %ebx,8(%rdi)
-
.Lnocacheinfo:
mov \$1,%eax
cpuid
@@ -174,7 +165,6 @@ OPENSSL_ia32_cpuid:
.Lclear_avx:
mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
and %eax,%r9d # clear AVX, FMA and AMD XOP bits
- andl \$0xffffffdf,8(%rdi) # clear AVX2, ~(1<<5)
.Ldone:
shl \$32,%r9
mov %r10d,%eax
@@ -273,96 +263,6 @@ OPENSSL_wipe_cpu:
ret
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
-{
-my $out="%r10";
-my $cnt="%rcx";
-my $max="%r11";
-my $lasttick="%r8d";
-my $lastdiff="%r9d";
-my $redzone=win64?8:-8;
-
-print<<___;
-.globl OPENSSL_instrument_bus
-.type OPENSSL_instrument_bus,\@abi-omnipotent
-.align 16
-OPENSSL_instrument_bus:
- mov $arg1,$out # tribute to Win64
- mov $arg2,$cnt
- mov $arg2,$max
-
- rdtsc # collect 1st tick
- mov %eax,$lasttick # lasttick = tick
- mov \$0,$lastdiff # lastdiff = 0
- clflush ($out)
- .byte 0xf0 # lock
- add $lastdiff,($out)
- jmp .Loop
-.align 16
-.Loop: rdtsc
- mov %eax,%edx
- sub $lasttick,%eax
- mov %edx,$lasttick
- mov %eax,$lastdiff
- clflush ($out)
- .byte 0xf0 # lock
- add %eax,($out)
- lea 4($out),$out
- sub \$1,$cnt
- jnz .Loop
-
- mov $max,%rax
- ret
-.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
-
-.globl OPENSSL_instrument_bus2
-.type OPENSSL_instrument_bus2,\@abi-omnipotent
-.align 16
-OPENSSL_instrument_bus2:
- mov $arg1,$out # tribute to Win64
- mov $arg2,$cnt
- mov $arg3,$max
- mov $cnt,$redzone(%rsp)
-
- rdtsc # collect 1st tick
- mov %eax,$lasttick # lasttick = tick
- mov \$0,$lastdiff # lastdiff = 0
-
- clflush ($out)
- .byte 0xf0 # lock
- add $lastdiff,($out)
-
- rdtsc # collect 1st diff
- mov %eax,%edx
- sub $lasttick,%eax # diff
- mov %edx,$lasttick # lasttick = tick
- mov %eax,$lastdiff # lastdiff = diff
-.Loop2:
- clflush ($out)
- .byte 0xf0 # lock
- add %eax,($out) # accumulate diff
-
- sub \$1,$max
- jz .Ldone2
-
- rdtsc
- mov %eax,%edx
- sub $lasttick,%eax # diff
- mov %edx,$lasttick # lasttick = tick
- cmp $lastdiff,%eax
- mov %eax,$lastdiff # lastdiff = diff
- mov \$0,%edx
- setne %dl
- sub %rdx,$cnt # conditional --$cnt
- lea ($out,%rdx,4),$out # conditional ++$out
- jnz .Loop2
-
-.Ldone2:
- mov $redzone(%rsp),%rax
- sub $cnt,%rax
- ret
-.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
-___
-}
print<<___;
.globl OPENSSL_ia32_rdrand
@@ -379,21 +279,6 @@ OPENSSL_ia32_rdrand:
cmove %rcx,%rax
ret
.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
-
-.globl OPENSSL_ia32_rdseed
-.type OPENSSL_ia32_rdseed,\@abi-omnipotent
-.align 16
-OPENSSL_ia32_rdseed:
- mov \$8,%ecx
-.Loop_rdseed:
- rdseed %rax
- jc .Lbreak_rdseed
- loop .Loop_rdseed
-.Lbreak_rdseed:
- cmp \$0,%rax
- cmove %rcx,%rax
- ret
-.size OPENSSL_ia32_rdseed,.-OPENSSL_ia32_rdseed
___
close STDOUT; # flush
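
A recurring pattern in the sha1-x86_64.pl hunks below: the pre4 code rotated
its register lists with push(@xi,shift(@xi)), while the restored code uses
unshift(@xi,pop(@xi)). Both cycle the list between rounds, just in opposite
directions, and pre4 carried three rotation registers where the restored code
carries two. A minimal stand-alone Perl sketch of the two idioms, using the
pre4 register list:

    my @xi = ("%edx", "%ebp", "%r14d");
    push(@xi, shift(@xi));      # rotate left:  ("%ebp", "%r14d", "%edx")
    @xi = ("%edx", "%ebp", "%r14d");
    unshift(@xi, pop(@xi));     # rotate right: ("%r14d", "%edx", "%ebp")
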
Index: src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl
diff -u src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl:1.4 src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl:1.5
--- src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl:1.4 Sun Mar 20 18:26:56 2016
+++ src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl Mon Mar 21 15:12:26 2016
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -49,38 +49,17 @@
#
# Add AVX code path. See sha1-586.pl for further information.
-# May 2013.
-#
-# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
-# and loading pair of consecutive blocks to 256-bit %ymm registers)
-# did not provide impressive performance improvement till a crucial
-# hint regarding the number of Xupdate iterations to pre-compute in
-# advance was provided by Ilya Albrekht of Intel Corp.
-
-# March 2014.
-#
-# Add support for Intel SHA Extensions.
-
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
-# x86_64 SSSE3 AVX[2]
-# P4 9.05 -
-# Opteron 6.26 -
-# Core2 6.55 6.05/+8% -
-# Westmere 6.73 5.30/+27% -
-# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
-# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
-# Haswell 5.45 4.15/+31% 3.57/+53%
-# Skylake 5.18 4.06/+28% 3.54/+46%
-# Bulldozer 9.11 5.95/+53%
-# VIA Nano 9.32 7.15/+30%
-# Atom 10.3 9.17/+12%
-# Silvermont 13.1(*) 9.37/+40%
-#
-# (*) obviously suboptimal result, nothing was done about it,
-# because SSSE3 code is compiled unconditionally;
+# x86_64 SSSE3 AVX
+# P4 9.8 -
+# Opteron 6.6 -
+# Core2 6.7 6.1/+10% -
+# Atom 11.0 9.7/+13% -
+# Westmere 7.1 5.6/+27% -
+# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
$flavour = shift;
$output = shift;
@@ -93,27 +72,15 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
-}
-
-if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.09) + ($1>=2.10);
-}
-
-if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
-}
-
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
-}
-
-$shaext=1; ### set to zero if compiling for 1.0.1
-$avx=1 if (!$shaext && $avx);
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@ -130,7 +97,7 @@ $num="%r10";
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
-@xi=("%edx","%ebp","%r14d");
+@xi=("%edx","%ebp");
$A="%esi";
$B="%edi";
$C="%r11d";
@@ -145,40 +112,42 @@ my $j=$i+1;
$code.=<<___ if ($i==0);
mov `4*$i`($inp),$xi[0]
bswap $xi[0]
+ mov $xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
+ mov $c,$t0
mov `4*$j`($inp),$xi[1]
- mov $d,$t0
- mov $xi[0],`4*$i`(%rsp)
mov $a,$t2
+ xor $d,$t0
bswap $xi[1]
- xor $c,$t0
rol \$5,$t2
- and $b,$t0
lea 0x5a827999($xi[0],$e),$e
+ and $b,$t0
+ mov $xi[1],`4*$j`(%rsp)
add $t2,$e
xor $d,$t0
rol \$30,$b
add $t0,$e
___
$code.=<<___ if ($i>=15);
- xor `4*($j%16)`(%rsp),$xi[1]
- mov $d,$t0
- mov $xi[0],`4*($i%16)`(%rsp)
+ mov `4*($j%16)`(%rsp),$xi[1]
+ mov $c,$t0
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $c,$t0
+ xor $d,$t0
rol \$5,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
and $b,$t0
lea 0x5a827999($xi[0],$e),$e
- rol \$30,$b
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
xor $d,$t0
- add $t2,$e
rol \$1,$xi[1]
+ add $t2,$e
+ rol \$30,$b
+ mov $xi[1],`4*($j%16)`(%rsp)
add $t0,$e
___
-push(@xi,shift(@xi));
+unshift(@xi,pop(@xi));
}
sub BODY_20_39 {
@@ -186,58 +155,62 @@ my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
- xor `4*($j%16)`(%rsp),$xi[1]
- mov $b,$t0
- `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)`
+ mov `4*($j%16)`(%rsp),$xi[1]
+ mov $c,$t0
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $d,$t0
+ xor $b,$t0
rol \$5,$t2
- xor `4*(($j+8)%16)`(%rsp),$xi[1]
lea $K($xi[0],$e),$e
- xor $c,$t0
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ xor $d,$t0
add $t2,$e
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
rol \$30,$b
add $t0,$e
rol \$1,$xi[1]
___
+$code.=<<___ if ($i<76);
+ mov $xi[1],`4*($j%16)`(%rsp)
+___
$code.=<<___ if ($i==79);
- mov $b,$t0
+ mov $c,$t0
mov $a,$t2
- xor $d,$t0
+ xor $b,$t0
lea $K($xi[0],$e),$e
rol \$5,$t2
- xor $c,$t0
+ xor $d,$t0
add $t2,$e
rol \$30,$b
add $t0,$e
___
-push(@xi,shift(@xi));
+unshift(@xi,pop(@xi));
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
- xor `4*($j%16)`(%rsp),$xi[1]
- mov $d,$t0
- mov $xi[0],`4*($i%16)`(%rsp)
- mov $d,$t1
+ mov `4*($j%16)`(%rsp),$xi[1]
+ mov $c,$t0
+ mov $c,$t1
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- and $c,$t0
+ and $d,$t0
mov $a,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ xor $d,$t1
lea 0x8f1bbcdc($xi[0],$e),$e
- xor $c,$t1
rol \$5,$t2
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
add $t0,$e
- rol \$1,$xi[1]
and $b,$t1
- add $t2,$e
- rol \$30,$b
+ rol \$1,$xi[1]
add $t1,$e
+ rol \$30,$b
+ mov $xi[1],`4*($j%16)`(%rsp)
+ add $t2,$e
___
-push(@xi,shift(@xi));
+unshift(@xi,pop(@xi));
}
$code.=<<___;
@@ -248,45 +221,31 @@ $code.=<<___;
.type sha1_block_data_order,\@function,3
.align 16
sha1_block_data_order:
- mov OPENSSL_ia32cap_P+0(%rip),%r9d
- mov OPENSSL_ia32cap_P+4(%rip),%r8d
- mov OPENSSL_ia32cap_P+8(%rip),%r10d
- test \$`1<<9`,%r8d # check SSSE3 bit
- jz .Lialu
-___
-$code.=<<___ if ($shaext);
- test \$`1<<29`,%r10d # check SHA bit
- jnz _shaext_shortcut
-___
-$code.=<<___ if ($avx>1);
- and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2
- cmp \$`1<<3|1<<5|1<<8`,%r10d
- je _avx2_shortcut
+ mov OPENSSL_ia32cap_P+0(%rip),%r8
+ mov 4(%r8),%r8d
+ bt \$9,%r8d
+ jnc .Lialu
___
$code.=<<___ if ($avx);
- and \$`1<<28`,%r8d # mask AVX bit
- and \$`1<<30`,%r9d # mask "Intel CPU" bit
- or %r9d,%r8d
- cmp \$`1<<28|1<<30`,%r8d
- je _avx_shortcut
+ bt \$28,%r8d
+ jc _avx_shortcut
___
$code.=<<___;
jmp _ssse3_shortcut
.align 16
.Lialu:
- mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
- push %r14
+ mov %rsp,%r11
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
- mov %rax,`16*4`(%rsp)
+ mov %r11,`16*4`(%rsp)
.Lprologue:
mov 0($ctx),$A
@@ -320,187 +279,53 @@ $code.=<<___;
jnz .Lloop
mov `16*4`(%rsp),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov (%rsi),%r13
+ mov 8(%rsi),%r12
+ mov 16(%rsi),%rbp
+ mov 24(%rsi),%rbx
+ lea 32(%rsi),%rsp
.Lepilogue:
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
-if ($shaext) {{{
-######################################################################
-# Intel SHA Extensions implementation of SHA1 update function.
-#
-my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
-my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
-my @MSG=map("%xmm$_",(4..7));
-
-$code.=<<___;
-.type sha1_block_data_order_shaext,\@function,3
-.align 32
-sha1_block_data_order_shaext:
-_shaext_shortcut:
-___
-$code.=<<___ if ($win64);
- lea `-8-4*16`(%rsp),%rsp
- movaps %xmm6,-8-4*16(%rax)
- movaps %xmm7,-8-3*16(%rax)
- movaps %xmm8,-8-2*16(%rax)
- movaps %xmm9,-8-1*16(%rax)
-.Lprologue_shaext:
-___
-$code.=<<___;
- movdqu ($ctx),$ABCD
- movd 16($ctx),$E
- movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap
-
- movdqu ($inp),@MSG[0]
- pshufd \$0b00011011,$ABCD,$ABCD # flip word order
- movdqu 0x10($inp),@MSG[1]
- pshufd \$0b00011011,$E,$E # flip word order
- movdqu 0x20($inp),@MSG[2]
- pshufb $BSWAP,@MSG[0]
- movdqu 0x30($inp),@MSG[3]
- pshufb $BSWAP,@MSG[1]
- pshufb $BSWAP,@MSG[2]
- movdqa $E,$E_SAVE # offload $E
- pshufb $BSWAP,@MSG[3]
- jmp .Loop_shaext
-
-.align 16
-.Loop_shaext:
- dec $num
- lea 0x40($inp),%rax # next input block
- paddd @MSG[0],$E
- cmovne %rax,$inp
- movdqa $ABCD,$ABCD_SAVE # offload $ABCD
-___
-for($i=0;$i<20-4;$i+=2) {
-$code.=<<___;
- sha1msg1 @MSG[1],@MSG[0]
- movdqa $ABCD,$E_
- sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
- sha1nexte @MSG[1],$E_
- pxor @MSG[2],@MSG[0]
- sha1msg1 @MSG[2],@MSG[1]
- sha1msg2 @MSG[3],@MSG[0]
-
- movdqa $ABCD,$E
- sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
- sha1nexte @MSG[2],$E
- pxor @MSG[3],@MSG[1]
- sha1msg2 @MSG[0],@MSG[1]
-___
- push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- movdqu ($inp),@MSG[0]
- movdqa $ABCD,$E_
- sha1rnds4 \$3,$E,$ABCD # 64-67
- sha1nexte @MSG[1],$E_
- movdqu 0x10($inp),@MSG[1]
- pshufb $BSWAP,@MSG[0]
-
- movdqa $ABCD,$E
- sha1rnds4 \$3,$E_,$ABCD # 68-71
- sha1nexte @MSG[2],$E
- movdqu 0x20($inp),@MSG[2]
- pshufb $BSWAP,@MSG[1]
-
- movdqa $ABCD,$E_
- sha1rnds4 \$3,$E,$ABCD # 72-75
- sha1nexte @MSG[3],$E_
- movdqu 0x30($inp),@MSG[3]
- pshufb $BSWAP,@MSG[2]
-
- movdqa $ABCD,$E
- sha1rnds4 \$3,$E_,$ABCD # 76-79
- sha1nexte $E_SAVE,$E
- pshufb $BSWAP,@MSG[3]
-
- paddd $ABCD_SAVE,$ABCD
- movdqa $E,$E_SAVE # offload $E
-
- jnz .Loop_shaext
-
- pshufd \$0b00011011,$ABCD,$ABCD
- pshufd \$0b00011011,$E,$E
- movdqu $ABCD,($ctx)
- movd $E,16($ctx)
-___
-$code.=<<___ if ($win64);
- movaps -8-4*16(%rax),%xmm6
- movaps -8-3*16(%rax),%xmm7
- movaps -8-2*16(%rax),%xmm8
- movaps -8-1*16(%rax),%xmm9
- mov %rax,%rsp
-.Lepilogue_shaext:
-___
-$code.=<<___;
- ret
-.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
-___
-}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
-my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
my $j=0;
-my $rx=0;
my $K_XX_XX="%r11";
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
-{ my $sn;
-sub align32() {
- ++$sn;
-$code.=<<___;
- jmp .Lalign32_$sn # see "Decoded ICache" in manual
-.align 32
-.Lalign32_$sn:
-___
-}
-}
-
$code.=<<___;
.type sha1_block_data_order_ssse3,\@function,3
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
- mov %rsp,%rax
push %rbx
push %rbp
push %r12
- push %r13 # redundant, done to share Win64 SE handler
- push %r14
- lea `-64-($win64?6*16:0)`(%rsp),%rsp
+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-40-6*16(%rax)
- movaps %xmm7,-40-5*16(%rax)
- movaps %xmm8,-40-4*16(%rax)
- movaps %xmm9,-40-3*16(%rax)
- movaps %xmm10,-40-2*16(%rax)
- movaps %xmm11,-40-1*16(%rax)
+ movaps %xmm6,64+0(%rsp)
+ movaps %xmm7,64+16(%rsp)
+ movaps %xmm8,64+32(%rsp)
+ movaps %xmm9,64+48(%rsp)
+ movaps %xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
- and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
shl \$6,$num
add $inp,$num
- lea K_XX_XX+64(%rip),$K_XX_XX
+ lea K_XX_XX(%rip),$K_XX_XX
mov 0($ctx),$A # load context
mov 4($ctx),$B
@@ -508,22 +333,19 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
- mov $C,@T[1]
- xor $D,@T[1]
- and @T[1],@T[0]
movdqa 64($K_XX_XX),@X[2] # pbswap mask
- movdqa -64($K_XX_XX),@Tx[1] # K_00_19
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
movdqu 16($inp),@X[-3&7]
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
pshufb @X[2],@X[-3&7]
pshufb @X[2],@X[-2&7]
- add \$64,$inp
- paddd @Tx[1],@X[-4&7] # add K_00_19
pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
@@ -548,75 +370,74 @@ sub Xupdate_ssse3_16_31() # recall that
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
- eval(shift(@insns)); # ror
- &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[0],@X[-1&7]);
- &paddd (@Tx[1],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
eval(shift(@insns));
eval(shift(@insns));
- &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
- eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
-
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns)); # rol
+ eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns)); # ror
- &movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psrld (@Tx[0],31);
eval(shift(@insns));
- eval(shift(@insns)); # rol
eval(shift(@insns));
&movdqa (@Tx[1],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[2],30);
- eval(shift(@insns));
- eval(shift(@insns)); # ror
&por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&pslld (@Tx[1],2);
&pxor (@X[0],@Tx[2]);
eval(shift(@insns));
- &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
- eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
- &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
foreach (@insns) { eval; } # remaining instructions [if any]
@@ -627,30 +448,27 @@ sub Xupdate_ssse3_16_31() # recall that
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
my ($a,$b,$c,$d,$e);
- eval(shift(@insns)) if ($Xi==8);
- &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- eval(shift(@insns)) if ($Xi==8);
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
eval(shift(@insns));
- eval(shift(@insns)) if (@insns[1] =~ /_ror/);
- eval(shift(@insns)) if (@insns[0] =~ /_ror/);
- &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
if ($Xi%5) {
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
} else { # ... or load next one
- &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
}
- eval(shift(@insns)); # ror
&paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
@@ -658,31 +476,29 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
- eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns)); # ror
eval(shift(@insns));
- eval(shift(@insns)); # body_20_39
&pslld (@X[0],2);
- eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
eval(shift(@insns));
&psrld (@Tx[0],30);
- eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
+ eval(shift(@insns));
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
- eval(shift(@insns));
eval(shift(@insns)); # body_20_39
- eval(shift(@insns)) if (@insns[1] =~ /_rol/);
- eval(shift(@insns)) if (@insns[0] =~ /_rol/);
- &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -703,11 +519,10 @@ sub Xuplast_ssse3_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
- eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
@@ -720,7 +535,7 @@ sub Xuplast_ssse3_80()
unshift(@Tx,pop(@Tx));
&movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
- &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
&movdqu (@X[-4&7],"0($inp)"); # load input
&movdqu (@X[-3&7],"16($inp)");
&movdqu (@X[-2&7],"32($inp)");
@@ -739,12 +554,9 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
@@ -753,8 +565,6 @@ sub Xloop_ssse3()
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@Tx[1]);
foreach (@insns) { eval; }
@@ -770,66 +580,51 @@ sub Xtail_ssse3()
foreach (@insns) { eval; }
}
-sub body_00_19 () { # ((c^d)&b)^d
- # on start @T[0]=(c^d)&b
- return &body_20_39() if ($rx==19); $rx++;
+sub body_00_19 () {
(
'($a,$b,$c,$d,$e)=@V;'.
- '&$_ror ($b,$j?7:2)', # $b>>>2
- '&xor (@T[0],$d)',
- '&mov (@T[1],$a)', # $b for next round
-
- '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
- '&xor ($b,$c)', # $c^$d for next round
-
- '&$_rol ($a,5)',
- '&add ($e,@T[0])',
- '&and (@T[1],$b)', # ($b&($c^$d)) for next round
-
- '&xor ($b,$c)', # restore $b
- '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_20_39 () { # b^d^c
- # on entry @T[0]=b^d
- return &body_40_59() if ($rx==39); $rx++;
+sub body_20_39 () {
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
- '&xor (@T[0],$d) if($j==19);'.
- '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c)
- '&mov (@T[1],$a)', # $b for next round
-
- '&$_rol ($a,5)',
- '&add ($e,@T[0])',
- '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round
-
- '&$_ror ($b,7)', # $b>>>2
- '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_40_59 () { # ((b^c)&(c^d))^c
- # on entry @T[0]=(b^c), (c^=d)
- $rx++;
+sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
- '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d)
- '&xor ($c,$d) if ($j>=40)', # restore $c
-
- '&$_ror ($b,7)', # $b>>>2
- '&mov (@T[1],$a)', # $b for next round
- '&xor (@T[0],$c)',
-
- '&$_rol ($a,5)',
- '&add ($e,@T[0])',
- '&xor (@T[1],$c) if ($j==59);'.
- '&xor (@T[1],$b) if ($j< 59)', # b^c for next round
-
- '&xor ($b,$c) if ($j< 59)', # c^d for next round
- '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
$code.=<<___;
@@ -870,11 +665,8 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
- mov $C,@T[1]
mov $D,12($ctx)
- xor $D,@T[1]
mov $E,16($ctx)
- and @T[1],@T[0]
jmp .Loop_ssse3
.align 16
@@ -899,34 +691,31 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps 64+0(%rsp),%xmm6
+ movaps 64+16(%rsp),%xmm7
+ movaps 64+32(%rsp),%xmm8
+ movaps 64+48(%rsp),%xmm9
+ movaps 64+64(%rsp),%xmm10
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ lea `64+($win64?5*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r12
+ mov 8(%rsi),%rbp
+ mov 16(%rsi),%rbx
+ lea 24(%rsi),%rsp
.Lepilogue_ssse3:
ret
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
if ($avx) {
-$Xi=4; # reset variables
-@X=map("%xmm$_",(4..7,0..3));
-@Tx=map("%xmm$_",(8..10));
-$j=0;
-$rx=0;
-
-my $done_avx_label=".Ldone_avx";
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
@@ -936,34 +725,28 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
- mov %rsp,%rax
push %rbx
push %rbp
push %r12
- push %r13 # redundant, done to share Win64 SE handler
- push %r14
- lea `-64-($win64?6*16:0)`(%rsp),%rsp
- vzeroupper
+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
+ movaps %xmm6,64+0(%rsp)
+ movaps %xmm7,64+16(%rsp)
+ movaps %xmm8,64+32(%rsp)
+ movaps %xmm9,64+48(%rsp)
+ movaps %xmm10,64+64(%rsp)
.Lprologue_avx:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
- and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
+ vzeroupper
shl \$6,$num
add $inp,$num
- lea K_XX_XX+64(%rip),$K_XX_XX
+ lea K_XX_XX(%rip),$K_XX_XX
mov 0($ctx),$A # load context
mov 4($ctx),$B
@@ -971,12 +754,9 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
- mov $C,@T[1]
- xor $D,@T[1]
- and @T[1],@T[0]
vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
- vmovdqa -64($K_XX_XX),$Kx # K_00_19
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
vmovdqu 16($inp),@X[-3&7]
vmovdqu 32($inp),@X[-2&7]
@@ -986,9 +766,9 @@ $code.=<<___;
vpshufb @X[2],@X[-3&7],@X[-3&7]
vpshufb @X[2],@X[-2&7],@X[-2&7]
vpshufb @X[2],@X[-1&7],@X[-1&7]
- vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
- vpaddd $Kx,@X[-3&7],@X[1]
- vpaddd $Kx,@X[-2&7],@X[2]
+ vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
+ vpaddd @Tx[1],@X[-3&7],@X[1]
+ vpaddd @Tx[1],@X[-2&7],@X[2]
vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
vmovdqa @X[1],16(%rsp)
vmovdqa @X[2],32(%rsp)
@@ -1007,10 +787,10 @@ sub Xupdate_avx_16_31() # recall that $
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
@@ -1060,7 +840,7 @@ sub Xupdate_avx_16_31() # recall that $
&vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
eval(shift(@insns));
eval(shift(@insns));
- &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
@@ -1068,12 +848,13 @@ sub Xupdate_avx_16_31() # recall that $
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
}
sub Xupdate_avx_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
my ($a,$b,$c,$d,$e);
&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
@@ -1086,8 +867,12 @@ sub Xupdate_avx_32_79()
&vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
- &vpaddd (@Tx[1],$Kx,@X[-1&7]);
- &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
+ if ($Xi%5) {
+ &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
eval(shift(@insns)); # ror
eval(shift(@insns));
@@ -1117,6 +902,7 @@ sub Xupdate_avx_32_79()
&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
+ &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -1127,6 +913,7 @@ sub Xupdate_avx_32_79()
foreach (@insns) { eval; } # remaining instructions
$Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
}
sub Xuplast_avx_80()
@@ -1136,21 +923,23 @@ sub Xuplast_avx_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
foreach (@insns) { eval; } # remaining instructions
&cmp ($inp,$num);
- &je ($done_avx_label);
+ &je (".Ldone_avx");
+
+ unshift(@Tx,pop(@Tx));
&vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
- &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19
+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
&vmovdqu(@X[-4&7],"0($inp)"); # load input
&vmovdqu(@X[-3&7],"16($inp)");
&vmovdqu(@X[-2&7],"32($inp)");
@@ -1172,7 +961,7 @@ sub Xloop_avx()
&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
@@ -1232,15 +1021,12 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
- mov $C,@T[1]
mov $D,12($ctx)
- xor $D,@T[1]
mov $E,16($ctx)
- and @T[1],@T[0]
jmp .Loop_avx
.align 16
-$done_avx_label:
+.Ldone_avx:
___
$j=$saved_j; @V=@saved_V;
@@ -1263,520 +1049,31 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps 64+0(%rsp),%xmm6
+ movaps 64+16(%rsp),%xmm7
+ movaps 64+32(%rsp),%xmm8
+ movaps 64+48(%rsp),%xmm9
+ movaps 64+64(%rsp),%xmm10
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ lea `64+($win64?5*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r12
+ mov 8(%rsi),%rbp
+ mov 16(%rsi),%rbx
+ lea 24(%rsi),%rsp
.Lepilogue_avx:
ret
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
-
-if ($avx>1) {
-use integer;
-$Xi=4; # reset variables
-@X=map("%ymm$_",(4..7,0..3));
-@Tx=map("%ymm$_",(8..10));
-$Kx="%ymm11";
-$j=0;
-
-my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
-my ($a5,$t0)=("%r12d","%edi");
-
-my ($A,$F,$B,$C,$D,$E)=@ROTX;
-my $rx=0;
-my $frame="%r13";
-
-$code.=<<___;
-.type sha1_block_data_order_avx2,\@function,3
-.align 16
-sha1_block_data_order_avx2:
-_avx2_shortcut:
- mov %rsp,%rax
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- vzeroupper
-___
-$code.=<<___ if ($win64);
- lea -6*16(%rsp),%rsp
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
-.Lprologue_avx2:
-___
-$code.=<<___;
- mov %rax,%r14 # original %rsp
- mov %rdi,$ctx # reassigned argument
- mov %rsi,$inp # reassigned argument
- mov %rdx,$num # reassigned argument
-
- lea -640(%rsp),%rsp
- shl \$6,$num
- lea 64($inp),$frame
- and \$-128,%rsp
- add $inp,$num
- lea K_XX_XX+64(%rip),$K_XX_XX
-
- mov 0($ctx),$A # load context
- cmp $num,$frame
- cmovae $inp,$frame # next or same block
- mov 4($ctx),$F
- mov 8($ctx),$C
- mov 12($ctx),$D
- mov 16($ctx),$E
- vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
-
- vmovdqu ($inp),%xmm0
- vmovdqu 16($inp),%xmm1
- vmovdqu 32($inp),%xmm2
- vmovdqu 48($inp),%xmm3
- lea 64($inp),$inp
- vinserti128 \$1,($frame),@X[-4&7],@X[-4&7]
- vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
- vpshufb @X[2],@X[-4&7],@X[-4&7]
- vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
- vpshufb @X[2],@X[-3&7],@X[-3&7]
- vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
- vpshufb @X[2],@X[-2&7],@X[-2&7]
- vmovdqu -64($K_XX_XX),$Kx # K_00_19
- vpshufb @X[2],@X[-1&7],@X[-1&7]
-
- vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
- vpaddd $Kx,@X[-3&7],@X[1]
- vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU
- vpaddd $Kx,@X[-2&7],@X[2]
- vmovdqu @X[1],32(%rsp)
- vpaddd $Kx,@X[-1&7],@X[3]
- vmovdqu @X[2],64(%rsp)
- vmovdqu @X[3],96(%rsp)
-___
-for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31
- use integer;
-
- &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
- &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
- &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
- &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
- &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
- &vpsrld (@Tx[0],@X[0],31);
- &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
- &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
- &vpaddd (@X[0],@X[0],@X[0]);
- &vpsrld (@Tx[1],@Tx[2],30);
- &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
- &vpslld (@Tx[2],@Tx[2],2);
- &vpxor (@X[0],@X[0],@Tx[1]);
- &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
- &vpaddd (@Tx[1],@X[0],$Kx);
- &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-$code.=<<___;
- lea 128(%rsp),$frame
- jmp .Loop_avx2
-.align 32
-.Loop_avx2:
- rorx \$2,$F,$B
- andn $D,$F,$t0
- and $C,$F
- xor $t0,$F
-___
-sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path
- # at start $f=(b&c)^(~b&d), $b>>>=2
- return &bodyx_20_39() if ($rx==19); $rx++;
- (
- '($a,$f,$b,$c,$d,$e)=@ROTX;'.
-
- '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
- '&lea ($frame,"256($frame)") if ($j%32==31);',
- '&andn ($t0,$a,$c)', # ~b&d for next round
-
- '&add ($e,$f)', # e+=(b&c)^(~b&d)
- '&rorx ($a5,$a,27)', # a<<<5
- '&rorx ($f,$a,2)', # b>>>2 for next round
- '&and ($a,$b)', # b&c for next round
-
- '&add ($e,$a5)', # e+=a<<<5
- '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round
-
- 'unshift(@ROTX,pop(@ROTX)); $j++;'
- )
-}
-
-sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path
- # on entry $f=b^c^d, $b>>>=2
- return &bodyx_40_59() if ($rx==39); $rx++;
- (
- '($a,$f,$b,$c,$d,$e)=@ROTX;'.
-
- '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
- '&lea ($frame,"256($frame)") if ($j%32==31);',
-
- '&lea ($e,"($e,$f)")', # e+=b^c^d
- '&rorx ($a5,$a,27)', # a<<<5
- '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round
- '&xor ($a,$b) if ($j<79)', # b^c for next round
-
- '&add ($e,$a5)', # e+=a<<<5
- '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round
-
- 'unshift(@ROTX,pop(@ROTX)); $j++;'
- )
-}
-
-sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
- # on entry $f=((b^c)&(c^d)), $b>>>=2
- $rx++;
- (
- '($a,$f,$b,$c,$d,$e)=@ROTX;'.
-
- '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
- '&lea ($frame,"256($frame)") if ($j%32==31);',
- '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c
- '&mov ($t0,$b) if ($j<59)', # count on zero latency
- '&xor ($t0,$c) if ($j<59)', # c^d for next round
-
- '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c
- '&rorx ($a5,$a,27)', # a<<<5
- '&rorx ($f,$a,2)', # b>>>2 in next round
- '&xor ($a,$b)', # b^c for next round
-
- '&add ($e,$a5)', # e+=a<<<5
- '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round
- '&xor ($a,$c) if ($j==59);'. # f=b^c^d for next round
-
- 'unshift(@ROTX,pop(@ROTX)); $j++;'
- )
-}
-
-sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
- my ($a,$b,$c,$d,$e);
-
- &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
- &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpsrld (@Tx[0],@X[0],31);
- &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
- &vpaddd (@X[0],@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpsrld (@Tx[1],@Tx[2],30);
- &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpslld (@Tx[2],@Tx[2],2);
- &vpxor (@X[0],@X[0],@Tx[1]);
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpaddd (@Tx[1],@X[0],$Kx);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
-
- foreach (@insns) { eval; } # remaining instructions [if any]
-
- $Xi++;
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub Xupdate_avx2_32_79()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions
- my ($a,$b,$c,$d,$e);
-
- &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
- &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
- &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpsrld (@Tx[0],@X[0],30);
- &vpslld (@X[0],@X[0],2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- #&vpslld (@X[0],@X[0],2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vpaddd (@Tx[1],@X[0],$Kx);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
-
- &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
-
- foreach (@insns) { eval; } # remaining instructions
-
- $Xi++;
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub Xloop_avx2()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions
- my ($a,$b,$c,$d,$e);
-
- foreach (@insns) { eval; }
-}
-
- &align32();
- &Xupdate_avx2_32_79(\&bodyx_00_19);
- &Xupdate_avx2_32_79(\&bodyx_00_19);
- &Xupdate_avx2_32_79(\&bodyx_00_19);
- &Xupdate_avx2_32_79(\&bodyx_00_19);
-
- &Xupdate_avx2_32_79(\&bodyx_20_39);
- &Xupdate_avx2_32_79(\&bodyx_20_39);
- &Xupdate_avx2_32_79(\&bodyx_20_39);
- &Xupdate_avx2_32_79(\&bodyx_20_39);
-
- &align32();
- &Xupdate_avx2_32_79(\&bodyx_40_59);
- &Xupdate_avx2_32_79(\&bodyx_40_59);
- &Xupdate_avx2_32_79(\&bodyx_40_59);
- &Xupdate_avx2_32_79(\&bodyx_40_59);
-
- &Xloop_avx2(\&bodyx_20_39);
- &Xloop_avx2(\&bodyx_20_39);
- &Xloop_avx2(\&bodyx_20_39);
- &Xloop_avx2(\&bodyx_20_39);
-
-$code.=<<___;
- lea 128($inp),$frame
- lea 128($inp),%rdi # borrow $t0
- cmp $num,$frame
- cmovae $inp,$frame # next or previous block
-
- # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
- add 0($ctx),@ROTX[0] # update context
- add 4($ctx),@ROTX[1]
- add 8($ctx),@ROTX[3]
- mov @ROTX[0],0($ctx)
- add 12($ctx),@ROTX[4]
- mov @ROTX[1],4($ctx)
- mov @ROTX[0],$A # A=d
- add 16($ctx),@ROTX[5]
- mov @ROTX[3],$a5
- mov @ROTX[3],8($ctx)
- mov @ROTX[4],$D # D=b
- #xchg @ROTX[5],$F # F=c, C=f
- mov @ROTX[4],12($ctx)
- mov @ROTX[1],$F # F=e
- mov @ROTX[5],16($ctx)
- #mov $F,16($ctx)
- mov @ROTX[5],$E # E=c
- mov $a5,$C # C=f
- #xchg $F,$E # E=c, F=e
-
- cmp $num,$inp
- je .Ldone_avx2
-___
-
-$Xi=4; # reset variables
-@X=map("%ymm$_",(4..7,0..3));
-
-$code.=<<___;
- vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
- cmp $num,%rdi # borrowed $t0
- ja .Last_avx2
-
- vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7]
- vmovdqu -48(%rdi),%xmm1
- vmovdqu -32(%rdi),%xmm2
- vmovdqu -16(%rdi),%xmm3
- vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7]
- vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
- vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
- vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
- jmp .Last_avx2
-
-.align 32
-.Last_avx2:
- lea 128+16(%rsp),$frame
- rorx \$2,$F,$B
- andn $D,$F,$t0
- and $C,$F
- xor $t0,$F
- sub \$-128,$inp
-___
- $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E);
-
- &Xloop_avx2 (\&bodyx_00_19);
- &Xloop_avx2 (\&bodyx_00_19);
- &Xloop_avx2 (\&bodyx_00_19);
- &Xloop_avx2 (\&bodyx_00_19);
-
- &Xloop_avx2 (\&bodyx_20_39);
- &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19
- &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap
- &Xloop_avx2 (\&bodyx_20_39);
- &vpshufb (@X[-3&7],@X[-3&7],@X[2]);
- &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19
- &Xloop_avx2 (\&bodyx_20_39);
- &vmovdqu ("0(%rsp)",@Tx[0]);
- &vpshufb (@X[-2&7],@X[-2&7],@X[2]);
- &vpaddd (@Tx[1],@X[-3&7],$Kx);
- &Xloop_avx2 (\&bodyx_20_39);
- &vmovdqu ("32(%rsp)",@Tx[1]);
- &vpshufb (@X[-1&7],@X[-1&7],@X[2]);
- &vpaddd (@X[2],@X[-2&7],$Kx);
-
- &Xloop_avx2 (\&bodyx_40_59);
- &align32 ();
- &vmovdqu ("64(%rsp)",@X[2]);
- &vpaddd (@X[3],@X[-1&7],$Kx);
- &Xloop_avx2 (\&bodyx_40_59);
- &vmovdqu ("96(%rsp)",@X[3]);
- &Xloop_avx2 (\&bodyx_40_59);
- &Xupdate_avx2_16_31(\&bodyx_40_59);
-
- &Xupdate_avx2_16_31(\&bodyx_20_39);
- &Xupdate_avx2_16_31(\&bodyx_20_39);
- &Xupdate_avx2_16_31(\&bodyx_20_39);
- &Xloop_avx2 (\&bodyx_20_39);
-
-$code.=<<___;
- lea 128(%rsp),$frame
-
- # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
- add 0($ctx),@ROTX[0] # update context
- add 4($ctx),@ROTX[1]
- add 8($ctx),@ROTX[3]
- mov @ROTX[0],0($ctx)
- add 12($ctx),@ROTX[4]
- mov @ROTX[1],4($ctx)
- mov @ROTX[0],$A # A=d
- add 16($ctx),@ROTX[5]
- mov @ROTX[3],$a5
- mov @ROTX[3],8($ctx)
- mov @ROTX[4],$D # D=b
- #xchg @ROTX[5],$F # F=c, C=f
- mov @ROTX[4],12($ctx)
- mov @ROTX[1],$F # F=e
- mov @ROTX[5],16($ctx)
- #mov $F,16($ctx)
- mov @ROTX[5],$E # E=c
- mov $a5,$C # C=f
- #xchg $F,$E # E=c, F=e
-
- cmp $num,$inp
- jbe .Loop_avx2
-
-.Ldone_avx2:
- vzeroupper
-___
-$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
-___
-$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
-.Lepilogue_avx2:
- ret
-.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
-___
-}
}
$code.=<<___;
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
-.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
-.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
-.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
-.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
-.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
@@ -1822,58 +1119,20 @@ se_handler:
jae .Lcommon_seh_tail
mov `16*4`(%rax),%rax # pull saved stack pointer
+ lea 32(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
- mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
-___
-$code.=<<___ if ($shaext);
-.type shaext_handler,\@abi-omnipotent
-.align 16
-shaext_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- lea .Lprologue_shaext(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- lea .Lepilogue_shaext(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea -8-4*16(%rax),%rsi
- lea 512($context),%rdi # &context.Xmm6
- mov \$8,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
- jmp .Lcommon_seh_tail
-.size shaext_handler,.-shaext_handler
-___
-
-$code.=<<___;
.type ssse3_handler,\@abi-omnipotent
.align 16
ssse3_handler:
@@ -1906,23 +1165,18 @@ ssse3_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- mov 232($context),%rax # pull context->R14
-
- lea -40-6*16(%rax),%rsi
+ lea 64(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
- mov \$12,%ecx
+ mov \$10,%ecx
.long 0xa548f3fc # cld; rep movsq
+ lea `24+64+5*16`(%rax),%rax # adjust stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -1969,13 +1223,6 @@ ssse3_handler:
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
-___
-$code.=<<___ if ($shaext);
- .rva .LSEH_begin_sha1_block_data_order_shaext
- .rva .LSEH_end_sha1_block_data_order_shaext
- .rva .LSEH_info_sha1_block_data_order_shaext
-___
-$code.=<<___;
.rva .LSEH_begin_sha1_block_data_order_ssse3
.rva .LSEH_end_sha1_block_data_order_ssse3
.rva .LSEH_info_sha1_block_data_order_ssse3
@@ -1985,24 +1232,12 @@ $code.=<<___ if ($avx);
.rva .LSEH_end_sha1_block_data_order_avx
.rva .LSEH_info_sha1_block_data_order_avx
___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_sha1_block_data_order_avx2
- .rva .LSEH_end_sha1_block_data_order_avx2
- .rva .LSEH_info_sha1_block_data_order_avx2
-___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
-___
-$code.=<<___ if ($shaext);
-.LSEH_info_sha1_block_data_order_shaext:
- .byte 9,0,0,0
- .rva shaext_handler
-___
-$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
@@ -2014,55 +1249,10 @@ $code.=<<___ if ($avx);
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
-$code.=<<___ if ($avx>1);
-.LSEH_info_sha1_block_data_order_avx2:
- .byte 9,0,0,0
- .rva ssse3_handler
- .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
-___
}
####################################################################
-sub sha1rnds4 {
- if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
- my @opcode=(0x0f,0x3a,0xcc);
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
- my $c=$1;
- push @opcode,$c=~/^0/?oct($c):$c;
- return ".byte\t".join(',',@opcode);
- } else {
- return "sha1rnds4\t".@_[0];
- }
-}
-
-sub sha1op38 {
- my $instr = shift;
- my %opcodelet = (
- "sha1nexte" => 0xc8,
- "sha1msg1" => 0xc9,
- "sha1msg2" => 0xca );
-
- if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
- my @opcode=(0x0f,0x38);
- my $rex=0;
- $rex|=0x04 if ($2>=8);
- $rex|=0x01 if ($1>=8);
- unshift @opcode,0x40|$rex if ($rex);
- push @opcode,$opcodelet{$instr};
- push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
- return ".byte\t".join(',',@opcode);
- } else {
- return $instr."\t".@_[0];
- }
-}
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
- s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
-
- print $_,"\n";
-}
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
close STDOUT;
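
For reference, the per-round mixing functions that the BODY_00_19,
BODY_20_39 and BODY_40_59 subs above spread across scalar instructions are
the standard SHA-1 Ch, Parity and Maj functions, per the identities noted
in the hunks ("((c^d)&b)^d", "b^d^c", "((b^c)&(c^d))^c"). Written directly
in Perl as a sketch:

    sub F_00_19 { my ($b, $c, $d) = @_; (($c ^ $d) & $b) ^ $d }        # Ch
    sub F_20_39 { my ($b, $c, $d) = @_; $b ^ $c ^ $d }                 # Parity
    sub F_40_59 { my ($b, $c, $d) = @_; (($b ^ $c) & ($c ^ $d)) ^ $c } # Maj
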