Module Name:	src
Committed By:	christos
Date:		Mon Mar 21 19:12:26 UTC 2016
Modified Files:
	src/crypto/external/bsd/openssl/dist/crypto: x86_64cpuid.pl
	src/crypto/external/bsd/openssl/dist/crypto/sha/asm: sha1-x86_64.pl

Log Message:
revert change from openssl-1.1.0-pre4, breaks gcc-4.8

To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 \
    src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl
cvs rdiff -u -r1.4 -r1.5 \
    src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
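For context before the diff: the restored sha1-x86_64.pl gates its AVX code
path on a single probe of the GNU assembler behind $CC, rather than on the
additional nasm/ml64/clang probes and the $shaext/$avx>1 machinery introduced
in openssl-1.1.0-pre4. A minimal, self-contained Perl sketch of that restored
idiom (assuming CC is set in the environment and points at a GNU toolchain;
this is an illustration, not part of the commit) is:

	#!/usr/bin/env perl
	# Probe the assembler driven by $CC; gas older than 2.19 cannot
	# assemble the AVX instructions, so the script keeps $avx at 0 and
	# emits only the integer and SSSE3 code paths.
	my $avx = 0;
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	        =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1 >= 2.19) {
	    $avx = 1;
	}
	print $avx ? "AVX path enabled\n" : "integer/SSSE3 only\n";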
Modified files: Index: src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl diff -u src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl:1.2 src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl:1.3 --- src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl:1.2 Sun Mar 20 18:26:56 2016 +++ src/crypto/external/bsd/openssl/dist/crypto/x86_64cpuid.pl Mon Mar 21 15:12:26 2016 @@ -24,7 +24,7 @@ print<<___; call OPENSSL_cpuid_setup .hidden OPENSSL_ia32cap_P -.comm OPENSSL_ia32cap_P,16,4 +.comm OPENSSL_ia32cap_P,8,4 .text @@ -53,13 +53,12 @@ OPENSSL_rdtsc: .size OPENSSL_rdtsc,.-OPENSSL_rdtsc .globl OPENSSL_ia32_cpuid -.type OPENSSL_ia32_cpuid,\@function,1 +.type OPENSSL_ia32_cpuid,\@abi-omnipotent .align 16 OPENSSL_ia32_cpuid: mov %rbx,%r8 # save %rbx xor %eax,%eax - mov %eax,8(%rdi) # clear 3rd word cpuid mov %eax,%r11d # max value for standard query level @@ -127,14 +126,6 @@ OPENSSL_ia32_cpuid: shr \$14,%r10d and \$0xfff,%r10d # number of cores -1 per L1D - cmp \$7,%r11d - jb .Lnocacheinfo - - mov \$7,%eax - xor %ecx,%ecx - cpuid - mov %ebx,8(%rdi) - .Lnocacheinfo: mov \$1,%eax cpuid @@ -174,7 +165,6 @@ OPENSSL_ia32_cpuid: .Lclear_avx: mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11) and %eax,%r9d # clear AVX, FMA and AMD XOP bits - andl \$0xffffffdf,8(%rdi) # cleax AVX2, ~(1<<5) .Ldone: shl \$32,%r9 mov %r10d,%eax @@ -273,96 +263,6 @@ OPENSSL_wipe_cpu: ret .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu ___ -{ -my $out="%r10"; -my $cnt="%rcx"; -my $max="%r11"; -my $lasttick="%r8d"; -my $lastdiff="%r9d"; -my $redzone=win64?8:-8; - -print<<___; -.globl OPENSSL_instrument_bus -.type OPENSSL_instrument_bus,\@abi-omnipotent -.align 16 -OPENSSL_instrument_bus: - mov $arg1,$out # tribute to Win64 - mov $arg2,$cnt - mov $arg2,$max - - rdtsc # collect 1st tick - mov %eax,$lasttick # lasttick = tick - mov \$0,$lastdiff # lastdiff = 0 - clflush ($out) - .byte 0xf0 # lock - add $lastdiff,($out) - jmp .Loop -.align 16 -.Loop: rdtsc - mov %eax,%edx - sub $lasttick,%eax - mov %edx,$lasttick - mov %eax,$lastdiff - clflush ($out) - .byte 0xf0 # lock - add %eax,($out) - lea 4($out),$out - sub \$1,$cnt - jnz .Loop - - mov $max,%rax - ret -.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus - -.globl OPENSSL_instrument_bus2 -.type OPENSSL_instrument_bus2,\@abi-omnipotent -.align 16 -OPENSSL_instrument_bus2: - mov $arg1,$out # tribute to Win64 - mov $arg2,$cnt - mov $arg3,$max - mov $cnt,$redzone(%rsp) - - rdtsc # collect 1st tick - mov %eax,$lasttick # lasttick = tick - mov \$0,$lastdiff # lastdiff = 0 - - clflush ($out) - .byte 0xf0 # lock - add $lastdiff,($out) - - rdtsc # collect 1st diff - mov %eax,%edx - sub $lasttick,%eax # diff - mov %edx,$lasttick # lasttick = tick - mov %eax,$lastdiff # lastdiff = diff -.Loop2: - clflush ($out) - .byte 0xf0 # lock - add %eax,($out) # accumulate diff - - sub \$1,$max - jz .Ldone2 - - rdtsc - mov %eax,%edx - sub $lasttick,%eax # diff - mov %edx,$lasttick # lasttick = tick - cmp $lastdiff,%eax - mov %eax,$lastdiff # lastdiff = diff - mov \$0,%edx - setne %dl - sub %rdx,$cnt # conditional --$cnt - lea ($out,%rdx,4),$out # conditional ++$out - jnz .Loop2 - -.Ldone2: - mov $redzone(%rsp),%rax - sub $cnt,%rax - ret -.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 -___ -} print<<___; .globl OPENSSL_ia32_rdrand @@ -379,21 +279,6 @@ OPENSSL_ia32_rdrand: cmove %rcx,%rax ret .size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand - -.globl OPENSSL_ia32_rdseed -.type OPENSSL_ia32_rdseed,\@abi-omnipotent -.align 16 -OPENSSL_ia32_rdseed: - mov \$8,%ecx -.Loop_rdseed: - 
rdseed %rax - jc .Lbreak_rdseed - loop .Loop_rdseed -.Lbreak_rdseed: - cmp \$0,%rax - cmove %rcx,%rax - ret -.size OPENSSL_ia32_rdseed,.-OPENSSL_ia32_rdseed ___ close STDOUT; # flush Index: src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl diff -u src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl:1.4 src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl:1.5 --- src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl:1.4 Sun Mar 20 18:26:56 2016 +++ src/crypto/external/bsd/openssl/dist/crypto/sha/asm/sha1-x86_64.pl Mon Mar 21 15:12:26 2016 @@ -1,7 +1,7 @@ #!/usr/bin/env perl # # ==================================================================== -# Written by Andy Polyakov <ap...@openssl.org> for the OpenSSL +# Written by Andy Polyakov <ap...@fy.chalmers.se> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -49,38 +49,17 @@ # # Add AVX code path. See sha1-586.pl for further information. -# May 2013. -# -# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions -# and loading pair of consecutive blocks to 256-bit %ymm registers) -# did not provide impressive performance improvement till a crucial -# hint regarding the number of Xupdate iterations to pre-compute in -# advance was provided by Ilya Albrekht of Intel Corp. - -# March 2014. -# -# Add support for Intel SHA Extensions. - ###################################################################### # Current performance is summarized in following table. Numbers are # CPU clock cycles spent to process single byte (less is better). # -# x86_64 SSSE3 AVX[2] -# P4 9.05 - -# Opteron 6.26 - -# Core2 6.55 6.05/+8% - -# Westmere 6.73 5.30/+27% - -# Sandy Bridge 7.70 6.10/+26% 4.99/+54% -# Ivy Bridge 6.06 4.67/+30% 4.60/+32% -# Haswell 5.45 4.15/+31% 3.57/+53% -# Skylake 5.18 4.06/+28% 3.54/+46% -# Bulldozer 9.11 5.95/+53% -# VIA Nano 9.32 7.15/+30% -# Atom 10.3 9.17/+12% -# Silvermont 13.1(*) 9.37/+40% -# -# (*) obviously suboptimal result, nothing was done about it, -# because SSSE3 code is compiled unconditionally; +# x86_64 SSSE3 AVX +# P4 9.8 - +# Opteron 6.6 - +# Core2 6.7 6.1/+10% - +# Atom 11.0 9.7/+13% - +# Westmere 7.1 5.6/+27% - +# Sandy Bridge 7.9 6.3/+25% 5.2/+51% $flavour = shift; $output = shift; @@ -93,27 +72,15 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); -} - -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.09) + ($1>=2.10); -} - -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); -} - -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); -} - -$shaext=1; ### set to zero if compiling for 1.0.1 -$avx=1 if (!$shaext && $avx); +$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); +$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.09); +$avx=1 
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./ && + $1>=10); open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; @@ -130,7 +97,7 @@ $num="%r10"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; -@xi=("%edx","%ebp","%r14d"); +@xi=("%edx","%ebp"); $A="%esi"; $B="%edi"; $C="%r11d"; @@ -145,40 +112,42 @@ my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi[0] bswap $xi[0] + mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); + mov $c,$t0 mov `4*$j`($inp),$xi[1] - mov $d,$t0 - mov $xi[0],`4*$i`(%rsp) mov $a,$t2 + xor $d,$t0 bswap $xi[1] - xor $c,$t0 rol \$5,$t2 - and $b,$t0 lea 0x5a827999($xi[0],$e),$e + and $b,$t0 + mov $xi[1],`4*$j`(%rsp) add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$e ___ $code.=<<___ if ($i>=15); - xor `4*($j%16)`(%rsp),$xi[1] - mov $d,$t0 - mov $xi[0],`4*($i%16)`(%rsp) + mov `4*($j%16)`(%rsp),$xi[1] + mov $c,$t0 mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] - xor $c,$t0 + xor $d,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 lea 0x5a827999($xi[0],$e),$e - rol \$30,$b + xor `4*(($j+13)%16)`(%rsp),$xi[1] xor $d,$t0 - add $t2,$e rol \$1,$xi[1] + add $t2,$e + rol \$30,$b + mov $xi[1],`4*($j%16)`(%rsp) add $t0,$e ___ -push(@xi,shift(@xi)); +unshift(@xi,pop(@xi)); } sub BODY_20_39 { @@ -186,58 +155,62 @@ my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); - xor `4*($j%16)`(%rsp),$xi[1] - mov $b,$t0 - `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` + mov `4*($j%16)`(%rsp),$xi[1] + mov $c,$t0 mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] - xor $d,$t0 + xor $b,$t0 rol \$5,$t2 - xor `4*(($j+8)%16)`(%rsp),$xi[1] lea $K($xi[0],$e),$e - xor $c,$t0 + xor `4*(($j+8)%16)`(%rsp),$xi[1] + xor $d,$t0 add $t2,$e + xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b add $t0,$e rol \$1,$xi[1] ___ +$code.=<<___ if ($i<76); + mov $xi[1],`4*($j%16)`(%rsp) +___ $code.=<<___ if ($i==79); - mov $b,$t0 + mov $c,$t0 mov $a,$t2 - xor $d,$t0 + xor $b,$t0 lea $K($xi[0],$e),$e rol \$5,$t2 - xor $c,$t0 + xor $d,$t0 add $t2,$e rol \$30,$b add $t0,$e ___ -push(@xi,shift(@xi)); +unshift(@xi,pop(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; - xor `4*($j%16)`(%rsp),$xi[1] - mov $d,$t0 - mov $xi[0],`4*($i%16)`(%rsp) - mov $d,$t1 + mov `4*($j%16)`(%rsp),$xi[1] + mov $c,$t0 + mov $c,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] - and $c,$t0 + and $d,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] + xor $d,$t1 lea 0x8f1bbcdc($xi[0],$e),$e - xor $c,$t1 rol \$5,$t2 + xor `4*(($j+13)%16)`(%rsp),$xi[1] add $t0,$e - rol \$1,$xi[1] and $b,$t1 - add $t2,$e - rol \$30,$b + rol \$1,$xi[1] add $t1,$e + rol \$30,$b + mov $xi[1],`4*($j%16)`(%rsp) + add $t2,$e ___ -push(@xi,shift(@xi)); +unshift(@xi,pop(@xi)); } $code.=<<___; @@ -248,45 +221,31 @@ $code.=<<___; .type sha1_block_data_order,\@function,3 .align 16 sha1_block_data_order: - mov OPENSSL_ia32cap_P+0(%rip),%r9d - mov OPENSSL_ia32cap_P+4(%rip),%r8d - mov OPENSSL_ia32cap_P+8(%rip),%r10d - test \$`1<<9`,%r8d # check SSSE3 bit - jz .Lialu -___ -$code.=<<___ if ($shaext); - test \$`1<<29`,%r10d # check SHA bit - jnz _shaext_shortcut -___ -$code.=<<___ if ($avx>1); - and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2 - cmp \$`1<<3|1<<5|1<<8`,%r10d - je _avx2_shortcut + mov OPENSSL_ia32cap_P+0(%rip),%r8 + mov 4(%r8),%r8d + bt \$9,%r8d + jnc .Lialu ___ $code.=<<___ if ($avx); - and \$`1<<28`,%r8d # mask AVX bit - and \$`1<<30`,%r9d # mask "Intel CPU" bit - or %r9d,%r8d - cmp \$`1<<28|1<<30`,%r8d - je _avx_shortcut + bt \$28,%r8d + jc 
_avx_shortcut ___ $code.=<<___; jmp _ssse3_shortcut .align 16 .Lialu: - mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 - push %r14 + mov %rsp,%r11 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument - mov %rax,`16*4`(%rsp) + mov %r11,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A @@ -320,187 +279,53 @@ $code.=<<___; jnz .Lloop mov `16*4`(%rsp),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + mov (%rsi),%r13 + mov 8(%rsi),%r12 + mov 16(%rsi),%rbp + mov 24(%rsi),%rbx + lea 32(%rsi),%rsp .Lepilogue: ret .size sha1_block_data_order,.-sha1_block_data_order ___ -if ($shaext) {{{ -###################################################################### -# Intel SHA Extensions implementation of SHA1 update function. -# -my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx"); -my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9)); -my @MSG=map("%xmm$_",(4..7)); - -$code.=<<___; -.type sha1_block_data_order_shaext,\@function,3 -.align 32 -sha1_block_data_order_shaext: -_shaext_shortcut: -___ -$code.=<<___ if ($win64); - lea `-8-4*16`(%rsp),%rsp - movaps %xmm6,-8-4*16(%rax) - movaps %xmm7,-8-3*16(%rax) - movaps %xmm8,-8-2*16(%rax) - movaps %xmm9,-8-1*16(%rax) -.Lprologue_shaext: -___ -$code.=<<___; - movdqu ($ctx),$ABCD - movd 16($ctx),$E - movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap - - movdqu ($inp),@MSG[0] - pshufd \$0b00011011,$ABCD,$ABCD # flip word order - movdqu 0x10($inp),@MSG[1] - pshufd \$0b00011011,$E,$E # flip word order - movdqu 0x20($inp),@MSG[2] - pshufb $BSWAP,@MSG[0] - movdqu 0x30($inp),@MSG[3] - pshufb $BSWAP,@MSG[1] - pshufb $BSWAP,@MSG[2] - movdqa $E,$E_SAVE # offload $E - pshufb $BSWAP,@MSG[3] - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - dec $num - lea 0x40($inp),%rax # next input block - paddd @MSG[0],$E - cmovne %rax,$inp - movdqa $ABCD,$ABCD_SAVE # offload $ABCD -___ -for($i=0;$i<20-4;$i+=2) { -$code.=<<___; - sha1msg1 @MSG[1],@MSG[0] - movdqa $ABCD,$E_ - sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3... 
- sha1nexte @MSG[1],$E_ - pxor @MSG[2],@MSG[0] - sha1msg1 @MSG[2],@MSG[1] - sha1msg2 @MSG[3],@MSG[0] - - movdqa $ABCD,$E - sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD - sha1nexte @MSG[2],$E - pxor @MSG[3],@MSG[1] - sha1msg2 @MSG[0],@MSG[1] -___ - push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); -} -$code.=<<___; - movdqu ($inp),@MSG[0] - movdqa $ABCD,$E_ - sha1rnds4 \$3,$E,$ABCD # 64-67 - sha1nexte @MSG[1],$E_ - movdqu 0x10($inp),@MSG[1] - pshufb $BSWAP,@MSG[0] - - movdqa $ABCD,$E - sha1rnds4 \$3,$E_,$ABCD # 68-71 - sha1nexte @MSG[2],$E - movdqu 0x20($inp),@MSG[2] - pshufb $BSWAP,@MSG[1] - - movdqa $ABCD,$E_ - sha1rnds4 \$3,$E,$ABCD # 72-75 - sha1nexte @MSG[3],$E_ - movdqu 0x30($inp),@MSG[3] - pshufb $BSWAP,@MSG[2] - - movdqa $ABCD,$E - sha1rnds4 \$3,$E_,$ABCD # 76-79 - sha1nexte $E_SAVE,$E - pshufb $BSWAP,@MSG[3] - - paddd $ABCD_SAVE,$ABCD - movdqa $E,$E_SAVE # offload $E - - jnz .Loop_shaext - - pshufd \$0b00011011,$ABCD,$ABCD - pshufd \$0b00011011,$E,$E - movdqu $ABCD,($ctx) - movd $E,16($ctx) -___ -$code.=<<___ if ($win64); - movaps -8-4*16(%rax),%xmm6 - movaps -8-3*16(%rax),%xmm7 - movaps -8-2*16(%rax),%xmm8 - movaps -8-1*16(%rax),%xmm9 - mov %rax,%rsp -.Lepilogue_shaext: -___ -$code.=<<___; - ret -.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext -___ -}}} {{{ my $Xi=4; my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); -my $Kx="%xmm11"; my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my $j=0; -my $rx=0; my $K_XX_XX="%r11"; my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; -{ my $sn; -sub align32() { - ++$sn; -$code.=<<___; - jmp .Lalign32_$sn # see "Decoded ICache" in manual -.align 32 -.Lalign32_$sn: -___ -} -} - $code.=<<___; .type sha1_block_data_order_ssse3,\@function,3 .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: - mov %rsp,%rax push %rbx push %rbp push %r12 - push %r13 # redundant, done to share Win64 SE handler - push %r14 - lea `-64-($win64?6*16:0)`(%rsp),%rsp + lea `-64-($win64?5*16:0)`(%rsp),%rsp ___ $code.=<<___ if ($win64); - movaps %xmm6,-40-6*16(%rax) - movaps %xmm7,-40-5*16(%rax) - movaps %xmm8,-40-4*16(%rax) - movaps %xmm9,-40-3*16(%rax) - movaps %xmm10,-40-2*16(%rax) - movaps %xmm11,-40-1*16(%rax) + movaps %xmm6,64+0(%rsp) + movaps %xmm7,64+16(%rsp) + movaps %xmm8,64+32(%rsp) + movaps %xmm9,64+48(%rsp) + movaps %xmm10,64+64(%rsp) .Lprologue_ssse3: ___ $code.=<<___; - mov %rax,%r14 # original %rsp - and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument shl \$6,$num add $inp,$num - lea K_XX_XX+64(%rip),$K_XX_XX + lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B @@ -508,22 +333,19 @@ $code.=<<___; mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E - mov $C,@T[1] - xor $D,@T[1] - and @T[1],@T[0] movdqa 64($K_XX_XX),@X[2] # pbswap mask - movdqa -64($K_XX_XX),@Tx[1] # K_00_19 + movdqa 0($K_XX_XX),@Tx[1] # K_00_19 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] movdqu 16($inp),@X[-3&7] movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @X[2],@X[-4&7] # byte swap + add \$64,$inp pshufb @X[2],@X[-3&7] pshufb @X[2],@X[-2&7] - add \$64,$inp - paddd @Tx[1],@X[-4&7] # add K_00_19 pshufb @X[2],@X[-1&7] + paddd @Tx[1],@X[-4&7] # add K_00_19 paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU @@ -548,75 +370,74 @@ sub Xupdate_ssse3_16_31() # recall that my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); - 
eval(shift(@insns)); # ror - &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); + &movdqa (@X[0],@X[-3&7]); + eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); - &paddd (@Tx[1],@X[-1&7]); + &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" eval(shift(@insns)); eval(shift(@insns)); - &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); - eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); - &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); - eval(shift(@insns)); # ror + eval(shift(@insns)); + &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); - eval(shift(@insns)); # rol + eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); + &movdqa (@Tx[0],@X[0]); + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); # ror - &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); - eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); - eval(shift(@insns)); - eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); - &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX - eval(shift(@insns)); # rol + eval(shift(@insns)); + &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 - &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] @@ -627,30 +448,27 @@ sub Xupdate_ssse3_16_31() # recall that sub Xupdate_ssse3_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions my ($a,$b,$c,$d,$e); - eval(shift(@insns)) if ($Xi==8); - &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - eval(shift(@insns)) if ($Xi==8); + &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); eval(shift(@insns)); # body_20_39 + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" eval(shift(@insns)); - eval(shift(@insns)) if (@insns[1] =~ /_ror/); - eval(shift(@insns)) if (@insns[0] =~ /_ror/); - &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); - eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... 
or load next one - &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); + &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); } - eval(shift(@insns)); # ror &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" @@ -658,31 +476,29 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol - eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); - eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); - eval(shift(@insns)); + eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &psrld (@Tx[0],30); - eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol + eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror + eval(shift(@insns)); &por (@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); eval(shift(@insns)); # body_20_39 - eval(shift(@insns)) if (@insns[1] =~ /_rol/); - eval(shift(@insns)) if (@insns[0] =~ /_rol/); - &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) + eval(shift(@insns)); + &movdqa (@Tx[1],@X[0]) if ($Xi<19); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); @@ -703,11 +519,10 @@ sub Xuplast_ssse3_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); - eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU @@ -720,7 +535,7 @@ sub Xuplast_ssse3_80() unshift(@Tx,pop(@Tx)); &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask - &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19 + &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 &movdqu (@X[-4&7],"0($inp)"); # load input &movdqu (@X[-3&7],"16($inp)"); &movdqu (@X[-2&7],"32($inp)"); @@ -739,12 +554,9 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); @@ -753,8 +565,6 @@ sub Xloop_ssse3() &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } @@ -770,66 +580,51 @@ sub Xtail_ssse3() foreach (@insns) { eval; } } -sub body_00_19 () { # ((c^d)&b)^d - # on start @T[0]=(c^d)&b - return &body_20_39() if ($rx==19); $rx++; +sub body_00_19 () { ( '($a,$b,$c,$d,$e)=@V;'. 
- '&$_ror ($b,$j?7:2)', # $b>>>2 - '&xor (@T[0],$d)', - '&mov (@T[1],$a)', # $b for next round - - '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer - '&xor ($b,$c)', # $c^$d for next round - - '&$_rol ($a,5)', - '&add ($e,@T[0])', - '&and (@T[1],$b)', # ($b&($c^$d)) for next round - - '&xor ($b,$c)', # restore $b - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer + '&xor ($c,$d);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&xor ($c,$d);', # restore $c + '&xor (@T[0],$d);', + '&add ($e,$a);', + '&$_ror ($b,$j?7:2);', # $b>>>2 + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } -sub body_20_39 () { # b^d^c - # on entry @T[0]=b^d - return &body_40_59() if ($rx==39); $rx++; +sub body_20_39 () { ( '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer - '&xor (@T[0],$d) if($j==19);'. - '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c) - '&mov (@T[1],$a)', # $b for next round - - '&$_rol ($a,5)', - '&add ($e,@T[0])', - '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round - - '&$_ror ($b,7)', # $b>>>2 - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&xor (@T[0],$d);', # ($b^$d) + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&xor (@T[0],$c);', # ($b^$d^$c) + '&add ($e,$a);', + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } -sub body_40_59 () { # ((b^c)&(c^d))^c - # on entry @T[0]=(b^c), (c^=d) - $rx++; +sub body_40_59 () { ( '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer - '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d) - '&xor ($c,$d) if ($j>=40)', # restore $c - - '&$_ror ($b,7)', # $b>>>2 - '&mov (@T[1],$a)', # $b for next round - '&xor (@T[0],$c)', - - '&$_rol ($a,5)', - '&add ($e,@T[0])', - '&xor (@T[1],$c) if ($j==59);'. 
- '&xor (@T[1],$b) if ($j< 59)', # b^c for next round - - '&xor ($b,$c) if ($j< 59)', # c^d for next round - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&mov (@T[1],$c);', + '&xor ($c,$d);', + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&and (@T[1],$d);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[1]);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor ($c,$d);', # restore $c + '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } $code.=<<___; @@ -870,11 +665,8 @@ $code.=<<___; mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) - mov $C,@T[1] mov $D,12($ctx) - xor $D,@T[1] mov $E,16($ctx) - and @T[1],@T[0] jmp .Loop_ssse3 .align 16 @@ -899,34 +691,31 @@ $code.=<<___; mov $E,16($ctx) ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps 64+0(%rsp),%xmm6 + movaps 64+16(%rsp),%xmm7 + movaps 64+32(%rsp),%xmm8 + movaps 64+48(%rsp),%xmm9 + movaps 64+64(%rsp),%xmm10 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + lea `64+($win64?5*16:0)`(%rsp),%rsi + mov 0(%rsi),%r12 + mov 8(%rsi),%rbp + mov 16(%rsi),%rbx + lea 24(%rsi),%rsp .Lepilogue_ssse3: ret .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 ___ if ($avx) { -$Xi=4; # reset variables -@X=map("%xmm$_",(4..7,0..3)); -@Tx=map("%xmm$_",(8..10)); -$j=0; -$rx=0; - -my $done_avx_label=".Ldone_avx"; +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; +my $K_XX_XX="%r11"; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; @@ -936,34 +725,28 @@ $code.=<<___; .align 16 sha1_block_data_order_avx: _avx_shortcut: - mov %rsp,%rax push %rbx push %rbp push %r12 - push %r13 # redundant, done to share Win64 SE handler - push %r14 - lea `-64-($win64?6*16:0)`(%rsp),%rsp - vzeroupper + lea `-64-($win64?5*16:0)`(%rsp),%rsp ___ $code.=<<___ if ($win64); - vmovaps %xmm6,-40-6*16(%rax) - vmovaps %xmm7,-40-5*16(%rax) - vmovaps %xmm8,-40-4*16(%rax) - vmovaps %xmm9,-40-3*16(%rax) - vmovaps %xmm10,-40-2*16(%rax) - vmovaps %xmm11,-40-1*16(%rax) + movaps %xmm6,64+0(%rsp) + movaps %xmm7,64+16(%rsp) + movaps %xmm8,64+32(%rsp) + movaps %xmm9,64+48(%rsp) + movaps %xmm10,64+64(%rsp) .Lprologue_avx: ___ $code.=<<___; - mov %rax,%r14 # original %rsp - and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument + vzeroupper shl \$6,$num add $inp,$num - lea K_XX_XX+64(%rip),$K_XX_XX + lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B @@ -971,12 +754,9 @@ $code.=<<___; mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E - mov $C,@T[1] - xor $D,@T[1] - and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask - vmovdqa -64($K_XX_XX),$Kx # K_00_19 + vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] @@ -986,9 +766,9 @@ $code.=<<___; vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] - vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 - vpaddd $Kx,@X[-3&7],@X[1] - vpaddd $Kx,@X[-2&7],@X[2] + vpaddd 
@Tx[1],@X[-4&7],@X[0] # add K_00_19 + vpaddd @Tx[1],@X[-3&7],@X[1] + vpaddd @Tx[1],@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) @@ -1007,10 +787,10 @@ sub Xupdate_avx_16_31() # recall that $ eval(shift(@insns)); eval(shift(@insns)); - &vpaddd (@Tx[1],$Kx,@X[-1&7]); + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" @@ -1060,7 +840,7 @@ sub Xupdate_avx_16_31() # recall that $ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 eval(shift(@insns)); eval(shift(@insns)); - &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX + &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); @@ -1068,12 +848,13 @@ sub Xupdate_avx_16_31() # recall that $ foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); } sub Xupdate_avx_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions my ($a,$b,$c,$d,$e); &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" @@ -1086,8 +867,12 @@ sub Xupdate_avx_32_79() &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); - &vpaddd (@Tx[1],$Kx,@X[-1&7]); - &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); + if ($Xi%5) { + &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); eval(shift(@insns)); # ror eval(shift(@insns)); @@ -1117,6 +902,7 @@ sub Xupdate_avx_32_79() &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); + &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); @@ -1127,6 +913,7 @@ sub Xupdate_avx_32_79() foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); } sub Xuplast_avx_80() @@ -1136,21 +923,23 @@ sub Xuplast_avx_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); - &vpaddd (@Tx[1],$Kx,@X[-1&7]); + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$num); - &je ($done_avx_label); + &je (".Ldone_avx"); + + unshift(@Tx,pop(@Tx)); &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask - &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19 + &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 &vmovdqu(@X[-4&7],"0($inp)"); # load input &vmovdqu(@X[-3&7],"16($inp)"); &vmovdqu(@X[-2&7],"32($inp)"); @@ -1172,7 +961,7 @@ sub Xloop_avx() &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); - &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); @@ -1232,15 +1021,12 @@ $code.=<<___; mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) - mov $C,@T[1] mov $D,12($ctx) - xor $D,@T[1] mov $E,16($ctx) - and @T[1],@T[0] 
jmp .Loop_avx .align 16 -$done_avx_label: +.Ldone_avx: ___ $j=$saved_j; @V=@saved_V; @@ -1263,520 +1049,31 @@ $code.=<<___; mov $E,16($ctx) ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps 64+0(%rsp),%xmm6 + movaps 64+16(%rsp),%xmm7 + movaps 64+32(%rsp),%xmm8 + movaps 64+48(%rsp),%xmm9 + movaps 64+64(%rsp),%xmm10 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + lea `64+($win64?5*16:0)`(%rsp),%rsi + mov 0(%rsi),%r12 + mov 8(%rsi),%rbp + mov 16(%rsi),%rbx + lea 24(%rsi),%rsp .Lepilogue_avx: ret .size sha1_block_data_order_avx,.-sha1_block_data_order_avx ___ - -if ($avx>1) { -use integer; -$Xi=4; # reset variables -@X=map("%ymm$_",(4..7,0..3)); -@Tx=map("%ymm$_",(8..10)); -$Kx="%ymm11"; -$j=0; - -my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi"); -my ($a5,$t0)=("%r12d","%edi"); - -my ($A,$F,$B,$C,$D,$E)=@ROTX; -my $rx=0; -my $frame="%r13"; - -$code.=<<___; -.type sha1_block_data_order_avx2,\@function,3 -.align 16 -sha1_block_data_order_avx2: -_avx2_shortcut: - mov %rsp,%rax - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - vzeroupper -___ -$code.=<<___ if ($win64); - lea -6*16(%rsp),%rsp - vmovaps %xmm6,-40-6*16(%rax) - vmovaps %xmm7,-40-5*16(%rax) - vmovaps %xmm8,-40-4*16(%rax) - vmovaps %xmm9,-40-3*16(%rax) - vmovaps %xmm10,-40-2*16(%rax) - vmovaps %xmm11,-40-1*16(%rax) -.Lprologue_avx2: -___ -$code.=<<___; - mov %rax,%r14 # original %rsp - mov %rdi,$ctx # reassigned argument - mov %rsi,$inp # reassigned argument - mov %rdx,$num # reassigned argument - - lea -640(%rsp),%rsp - shl \$6,$num - lea 64($inp),$frame - and \$-128,%rsp - add $inp,$num - lea K_XX_XX+64(%rip),$K_XX_XX - - mov 0($ctx),$A # load context - cmp $num,$frame - cmovae $inp,$frame # next or same block - mov 4($ctx),$F - mov 8($ctx),$C - mov 12($ctx),$D - mov 16($ctx),$E - vmovdqu 64($K_XX_XX),@X[2] # pbswap mask - - vmovdqu ($inp),%xmm0 - vmovdqu 16($inp),%xmm1 - vmovdqu 32($inp),%xmm2 - vmovdqu 48($inp),%xmm3 - lea 64($inp),$inp - vinserti128 \$1,($frame),@X[-4&7],@X[-4&7] - vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] - vpshufb @X[2],@X[-4&7],@X[-4&7] - vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] - vpshufb @X[2],@X[-3&7],@X[-3&7] - vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] - vpshufb @X[2],@X[-2&7],@X[-2&7] - vmovdqu -64($K_XX_XX),$Kx # K_00_19 - vpshufb @X[2],@X[-1&7],@X[-1&7] - - vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 - vpaddd $Kx,@X[-3&7],@X[1] - vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU - vpaddd $Kx,@X[-2&7],@X[2] - vmovdqu @X[1],32(%rsp) - vpaddd $Kx,@X[-1&7],@X[3] - vmovdqu @X[2],64(%rsp) - vmovdqu @X[3],96(%rsp) -___ -for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31 - use integer; - - &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - &vpsrld (@Tx[0],@X[0],31); - &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword - &vpaddd (@X[0],@X[0],@X[0]); - &vpsrld (@Tx[1],@Tx[2],30); - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); - &vpxor (@X[0],@X[0],@Tx[2]); # 
"X[0]"^=("X[0]">>96)<<<2 - &vpaddd (@Tx[1],@X[0],$Kx); - &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU - - push(@X,shift(@X)); # "rotate" X[] -} -$code.=<<___; - lea 128(%rsp),$frame - jmp .Loop_avx2 -.align 32 -.Loop_avx2: - rorx \$2,$F,$B - andn $D,$F,$t0 - and $C,$F - xor $t0,$F -___ -sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path - # at start $f=(b&c)^(~b&d), $b>>>=2 - return &bodyx_20_39() if ($rx==19); $rx++; - ( - '($a,$f,$b,$c,$d,$e)=@ROTX;'. - - '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K - '&lea ($frame,"256($frame)") if ($j%32==31);', - '&andn ($t0,$a,$c)', # ~b&d for next round - - '&add ($e,$f)', # e+=(b&c)^(~b&d) - '&rorx ($a5,$a,27)', # a<<<5 - '&rorx ($f,$a,2)', # b>>>2 for next round - '&and ($a,$b)', # b&c for next round - - '&add ($e,$a5)', # e+=a<<<5 - '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round - - 'unshift(@ROTX,pop(@ROTX)); $j++;' - ) -} - -sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path - # on entry $f=b^c^d, $b>>>=2 - return &bodyx_40_59() if ($rx==39); $rx++; - ( - '($a,$f,$b,$c,$d,$e)=@ROTX;'. - - '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K - '&lea ($frame,"256($frame)") if ($j%32==31);', - - '&lea ($e,"($e,$f)")', # e+=b^c^d - '&rorx ($a5,$a,27)', # a<<<5 - '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round - '&xor ($a,$b) if ($j<79)', # b^c for next round - - '&add ($e,$a5)', # e+=a<<<5 - '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round - - 'unshift(@ROTX,pop(@ROTX)); $j++;' - ) -} - -sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path - # on entry $f=((b^c)&(c^d)), $b>>>=2 - $rx++; - ( - '($a,$f,$b,$c,$d,$e)=@ROTX;'. - - '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K - '&lea ($frame,"256($frame)") if ($j%32==31);', - '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c - '&mov ($t0,$b) if ($j<59)', # count on zero latency - '&xor ($t0,$c) if ($j<59)', # c^d for next round - - '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c - '&rorx ($a5,$a,27)', # a<<<5 - '&rorx ($f,$a,2)', # b>>>2 in next round - '&xor ($a,$b)', # b^c for next round - - '&add ($e,$a5)', # e+=a<<<5 - '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round - '&xor ($a,$c) if ($j==59);'. 
# f=b^c^d for next round - - 'unshift(@ROTX,pop(@ROTX)); $j++;' - ) -} - -sub Xupdate_avx2_16_31() # recall that $Xi starts wtih 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions - my ($a,$b,$c,$d,$e); - - &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[0],@X[0],31); - &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword - &vpaddd (@X[0],@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[1],@Tx[2],30); - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpaddd (@Tx[1],@X[0],$Kx); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xupdate_avx2_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions - my ($a,$b,$c,$d,$e); - - &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[0],@X[0],30); - &vpslld (@X[0],@X[0],2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - #&vpslld (@X[0],@X[0],2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpaddd (@Tx[1],@X[0],$Kx); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xloop_avx2() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - - &align32(); - &Xupdate_avx2_32_79(\&bodyx_00_19); - &Xupdate_avx2_32_79(\&bodyx_00_19); - &Xupdate_avx2_32_79(\&bodyx_00_19); - &Xupdate_avx2_32_79(\&bodyx_00_19); - - &Xupdate_avx2_32_79(\&bodyx_20_39); - &Xupdate_avx2_32_79(\&bodyx_20_39); - 
&Xupdate_avx2_32_79(\&bodyx_20_39); - &Xupdate_avx2_32_79(\&bodyx_20_39); - - &align32(); - &Xupdate_avx2_32_79(\&bodyx_40_59); - &Xupdate_avx2_32_79(\&bodyx_40_59); - &Xupdate_avx2_32_79(\&bodyx_40_59); - &Xupdate_avx2_32_79(\&bodyx_40_59); - - &Xloop_avx2(\&bodyx_20_39); - &Xloop_avx2(\&bodyx_20_39); - &Xloop_avx2(\&bodyx_20_39); - &Xloop_avx2(\&bodyx_20_39); - -$code.=<<___; - lea 128($inp),$frame - lea 128($inp),%rdi # borrow $t0 - cmp $num,$frame - cmovae $inp,$frame # next or previous block - - # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c - add 0($ctx),@ROTX[0] # update context - add 4($ctx),@ROTX[1] - add 8($ctx),@ROTX[3] - mov @ROTX[0],0($ctx) - add 12($ctx),@ROTX[4] - mov @ROTX[1],4($ctx) - mov @ROTX[0],$A # A=d - add 16($ctx),@ROTX[5] - mov @ROTX[3],$a5 - mov @ROTX[3],8($ctx) - mov @ROTX[4],$D # D=b - #xchg @ROTX[5],$F # F=c, C=f - mov @ROTX[4],12($ctx) - mov @ROTX[1],$F # F=e - mov @ROTX[5],16($ctx) - #mov $F,16($ctx) - mov @ROTX[5],$E # E=c - mov $a5,$C # C=f - #xchg $F,$E # E=c, F=e - - cmp $num,$inp - je .Ldone_avx2 -___ - -$Xi=4; # reset variables -@X=map("%ymm$_",(4..7,0..3)); - -$code.=<<___; - vmovdqu 64($K_XX_XX),@X[2] # pbswap mask - cmp $num,%rdi # borrowed $t0 - ja .Last_avx2 - - vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7] - vmovdqu -48(%rdi),%xmm1 - vmovdqu -32(%rdi),%xmm2 - vmovdqu -16(%rdi),%xmm3 - vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7] - vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] - vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] - vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] - jmp .Last_avx2 - -.align 32 -.Last_avx2: - lea 128+16(%rsp),$frame - rorx \$2,$F,$B - andn $D,$F,$t0 - and $C,$F - xor $t0,$F - sub \$-128,$inp -___ - $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E); - - &Xloop_avx2 (\&bodyx_00_19); - &Xloop_avx2 (\&bodyx_00_19); - &Xloop_avx2 (\&bodyx_00_19); - &Xloop_avx2 (\&bodyx_00_19); - - &Xloop_avx2 (\&bodyx_20_39); - &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19 - &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap - &Xloop_avx2 (\&bodyx_20_39); - &vpshufb (@X[-3&7],@X[-3&7],@X[2]); - &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19 - &Xloop_avx2 (\&bodyx_20_39); - &vmovdqu ("0(%rsp)",@Tx[0]); - &vpshufb (@X[-2&7],@X[-2&7],@X[2]); - &vpaddd (@Tx[1],@X[-3&7],$Kx); - &Xloop_avx2 (\&bodyx_20_39); - &vmovdqu ("32(%rsp)",@Tx[1]); - &vpshufb (@X[-1&7],@X[-1&7],@X[2]); - &vpaddd (@X[2],@X[-2&7],$Kx); - - &Xloop_avx2 (\&bodyx_40_59); - &align32 (); - &vmovdqu ("64(%rsp)",@X[2]); - &vpaddd (@X[3],@X[-1&7],$Kx); - &Xloop_avx2 (\&bodyx_40_59); - &vmovdqu ("96(%rsp)",@X[3]); - &Xloop_avx2 (\&bodyx_40_59); - &Xupdate_avx2_16_31(\&bodyx_40_59); - - &Xupdate_avx2_16_31(\&bodyx_20_39); - &Xupdate_avx2_16_31(\&bodyx_20_39); - &Xupdate_avx2_16_31(\&bodyx_20_39); - &Xloop_avx2 (\&bodyx_20_39); - -$code.=<<___; - lea 128(%rsp),$frame - - # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c - add 0($ctx),@ROTX[0] # update context - add 4($ctx),@ROTX[1] - add 8($ctx),@ROTX[3] - mov @ROTX[0],0($ctx) - add 12($ctx),@ROTX[4] - mov @ROTX[1],4($ctx) - mov @ROTX[0],$A # A=d - add 16($ctx),@ROTX[5] - mov @ROTX[3],$a5 - mov @ROTX[3],8($ctx) - mov @ROTX[4],$D # D=b - #xchg @ROTX[5],$F # F=c, C=f - mov @ROTX[4],12($ctx) - mov @ROTX[1],$F # F=e - mov @ROTX[5],16($ctx) - #mov $F,16($ctx) - mov @ROTX[5],$E # E=c - mov $a5,$C # C=f - #xchg $F,$E # E=c, F=e - - cmp $num,$inp - jbe .Loop_avx2 - -.Ldone_avx2: - vzeroupper -___ -$code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - 
movaps -40-1*16(%r14),%xmm11 -___ -$code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp -.Lepilogue_avx2: - ret -.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 -___ -} } $code.=<<___; .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask -.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 ___ }}} $code.=<<___; @@ -1822,58 +1119,20 @@ se_handler: jae .Lcommon_seh_tail mov `16*4`(%rax),%rax # pull saved stack pointer + lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 - mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 jmp .Lcommon_seh_tail .size se_handler,.-se_handler -___ -$code.=<<___ if ($shaext); -.type shaext_handler,\@abi-omnipotent -.align 16 -shaext_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - lea .Lprologue_shaext(%rip),%r10 - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - lea .Lepilogue_shaext(%rip),%r10 - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea -8-4*16(%rax),%rsi - lea 512($context),%rdi # &context.Xmm6 - mov \$8,%ecx - .long 0xa548f3fc # cld; rep movsq - - jmp .Lcommon_seh_tail -.size shaext_handler,.-shaext_handler -___ - -$code.=<<___; .type ssse3_handler,\@abi-omnipotent .align 16 ssse3_handler: @@ -1906,23 +1165,18 @@ ssse3_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - mov 232($context),%rax # pull context->R14 - - lea -40-6*16(%rax),%rsi + lea 64(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 - mov \$12,%ecx + mov \$10,%ecx .long 0xa548f3fc # cld; rep movsq + lea `24+64+5*16`(%rax),%rax # adjust stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore cotnext->R12 - mov %r13,224($context) # restore cotnext->R13 - mov %r14,232($context) # restore cotnext->R14 .Lcommon_seh_tail: mov 8(%rax),%rdi @@ -1969,13 +1223,6 @@ ssse3_handler: .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order -___ -$code.=<<___ if ($shaext); - .rva .LSEH_begin_sha1_block_data_order_shaext - .rva .LSEH_end_sha1_block_data_order_shaext - .rva .LSEH_info_sha1_block_data_order_shaext -___ -$code.=<<___; .rva .LSEH_begin_sha1_block_data_order_ssse3 .rva .LSEH_end_sha1_block_data_order_ssse3 .rva .LSEH_info_sha1_block_data_order_ssse3 @@ -1985,24 +1232,12 @@ 
$code.=<<___ if ($avx); .rva .LSEH_end_sha1_block_data_order_avx .rva .LSEH_info_sha1_block_data_order_avx ___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_sha1_block_data_order_avx2 - .rva .LSEH_end_sha1_block_data_order_avx2 - .rva .LSEH_info_sha1_block_data_order_avx2 -___ $code.=<<___; .section .xdata .align 8 .LSEH_info_sha1_block_data_order: .byte 9,0,0,0 .rva se_handler -___ -$code.=<<___ if ($shaext); -.LSEH_info_sha1_block_data_order_shaext: - .byte 9,0,0,0 - .rva shaext_handler -___ -$code.=<<___; .LSEH_info_sha1_block_data_order_ssse3: .byte 9,0,0,0 .rva ssse3_handler @@ -2014,55 +1249,10 @@ $code.=<<___ if ($avx); .rva ssse3_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ -$code.=<<___ if ($avx>1); -.LSEH_info_sha1_block_data_order_avx2: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] -___ } #################################################################### -sub sha1rnds4 { - if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) { - my @opcode=(0x0f,0x3a,0xcc); - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - return ".byte\t".join(',',@opcode); - } else { - return "sha1rnds4\t".@_[0]; - } -} - -sub sha1op38 { - my $instr = shift; - my %opcodelet = ( - "sha1nexte" => 0xc8, - "sha1msg1" => 0xc9, - "sha1msg2" => 0xca ); - - if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x0f,0x38); - my $rex=0; - $rex|=0x04 if ($2>=8); - $rex|=0x01 if ($1>=8); - unshift @opcode,0x40|$rex if ($rex); - push @opcode,$opcodelet{$instr}; - push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M - return ".byte\t".join(',',@opcode); - } else { - return $instr."\t".@_[0]; - } -} - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or - s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo; - - print $_,"\n"; -} +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; close STDOUT;
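A note on the restored output stage at the bottom of sha1-x86_64.pl: the
final substitution evaluates every backtick-quoted span in the accumulated
$code as a Perl expression, which is how displacements such as `4*($j%16)`
in the round bodies above become literal offsets in the emitted assembly.
A small standalone sketch of the same idiom (hypothetical value for $j,
purely illustrative):

	#!/usr/bin/env perl
	# Same interpolation step as "$code =~ s/\`([^\`]*)\`/eval $1/gem":
	# each `...` span in $code is replaced by the result of eval'ing it.
	my $j = 5;
	my $code = "\tmov\t`4*($j%16)`(%rsp),%edx\n";
	$code =~ s/\`([^\`]*)\`/eval $1/gem;
	print $code;    # prints "mov 20(%rsp),%edx"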