[openssl-commits] [openssl] master update

Andy Polyakov Sun, 03 Jun 2018 12:21:01 -0700

The branch master has been updated
       via  c4d9ef4cc5bf1c48a74b64879622ae9fd6f26b03 (commit)
       via  1a467bd12f20928f3d5e6809b5f9394dbe606541 (commit)
       via  41013cd63c068e2f271fabc92702ee67d800f0cb (commit)
      from  9a708bf982da1d2c9739339d16d7b021da955e00 (commit)



- Log -----------------------------------------------------------------
commit c4d9ef4cc5bf1c48a74b64879622ae9fd6f26b03
Author: Andy Polyakov <[email protected]>
Date:   Sat Jun 2 15:25:50 2018 +0200

    sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.
    
    Biggest part, ~7%, of improvement resulted from omitting constants'
    table index increment in each round. And minor part from rescheduling
    instructions. Apparently POWER9 (and POWER8) manage to dispatch
    instructions more efficiently if they are laid down as if they have
    no latency...
    
    Reviewed-by: Rich Salz <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/6406)

commit 1a467bd12f20928f3d5e6809b5f9394dbe606541
Author: Andy Polyakov <[email protected]>
Date:   Sat Jun 2 14:14:28 2018 +0200

    chacha/asm/chacha-ppc.pl: improve POWER8 performance by 15%.
    
    This comes at cost of minor 2.5% regression on G4, which is reasonable
    trade-off. [Further improve compliance with ABI requirements.]
    
    Reviewed-by: Rich Salz <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/6406)

commit 41013cd63c068e2f271fabc92702ee67d800f0cb
Author: Andy Polyakov <[email protected]>
Date:   Sat Jun 2 14:03:27 2018 +0200

    PPC assembly pack: correct POWER9 results.
    
    As it turns out originally published results were skewed by "turbo"
    mode. VM apparently remains oblivious to dynamic frequency scaling,
    and reports that processor operates at "base" frequency at all times.
    While actual frequency gets increased under load.
    
    Reviewed-by: Rich Salz <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/6406)

-----------------------------------------------------------------------

Summary of changes:
 crypto/aes/asm/aesp8-ppc.pl           |   3 +-
 crypto/chacha/asm/chacha-ppc.pl       |  74 ++++++++++++---------
 crypto/modes/asm/ghashp8-ppc.pl       |   2 +-
 crypto/poly1305/asm/poly1305-ppc.pl   |   2 +-
 crypto/poly1305/asm/poly1305-ppcfp.pl |   1 -
 crypto/poly1305/poly1305_ieee754.c    |   1 -
 crypto/sha/asm/keccak1600-ppc64.pl    |   2 +-
 crypto/sha/asm/keccak1600p8-ppc.pl    |   2 +-
 crypto/sha/asm/sha512p8-ppc.pl        | 122 +++++++++++++++-------------------
 9 files changed, 102 insertions(+), 107 deletions(-)

diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
index 8670940..488b133 100755
--- a/crypto/aes/asm/aesp8-ppc.pl
+++ b/crypto/aes/asm/aesp8-ppc.pl
@@ -40,7 +40,8 @@
 #              CBC en-/decrypt CTR     XTS
 # POWER8[le]   3.96/0.72       0.74    1.1
 # POWER8[be]   3.75/0.65       0.66    1.0
-# POWER9[le]   3.05/0.65       0.65    0.80
+# POWER9[le]   4.02/0.86       0.84    1.05
+# POWER9[be]   3.99/0.78       0.79    0.97
 
 $flavour = shift;
 
diff --git a/crypto/chacha/asm/chacha-ppc.pl b/crypto/chacha/asm/chacha-ppc.pl
index 350d5fa..88746fe 100755
--- a/crypto/chacha/asm/chacha-ppc.pl
+++ b/crypto/chacha/asm/chacha-ppc.pl
@@ -23,11 +23,11 @@
 #                      IALU/gcc-4.x    3xAltiVec+1xIALU
 #
 # Freescale e300       13.6/+115%      -
-# PPC74x0/G4e          6.81/+310%      3.72
+# PPC74x0/G4e          6.81/+310%      3.81
 # PPC970/G5            9.29/+160%      ?
-# POWER7               8.62/+61%       3.38
-# POWER8               8.70/+51%       3.36
-# POWER9               6.61/+29%       3.30(*)
+# POWER7               8.62/+61%       3.35
+# POWER8               8.70/+51%       2.91
+# POWER9               8.80/+29%       4.44(*)
 #
 # (*)  this is trade-off result, it's possible to improve it, but
 #      then it would negatively affect all others;
@@ -398,12 +398,12 @@ ___
 my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2)
                                = map("v$_",(0..11));
 my @K                          = map("v$_",(12..17));
-my ($FOUR,$sixteen,$twenty4)   = map("v$_",(18..20));
-my ($inpperm,$outperm,$outmask)        = map("v$_",(21..23));
-my @D                          = map("v$_",(24..28));
+my ($FOUR,$sixteen,$twenty4)   = map("v$_",(18..19,23));
+my ($inpperm,$outperm,$outmask)        = map("v$_",(24..26));
+my @D                          = map("v$_",(27..31));
 my ($twelve,$seven,$T0,$T1) = @D;
 
-my $FRAME=$LOCALS+64+10*16+18*$SIZE_T; # 10*16 is for v20-v28 offload
+my $FRAME=$LOCALS+64+10*16+18*$SIZE_T; # 10*16 is for v23-v31 offload
 
 sub VMXROUND {
 my $odd = pop;
@@ -445,22 +445,22 @@ $code.=<<___;
        li      r10,`15+$LOCALS+64`
        li      r11,`31+$LOCALS+64`
        mfspr   r12,256
-       stvx    v20,r10,$sp
+       stvx    v23,r10,$sp
        addi    r10,r10,32
-       stvx    v21,r11,$sp
+       stvx    v24,r11,$sp
        addi    r11,r11,32
-       stvx    v22,r10,$sp
+       stvx    v25,r10,$sp
        addi    r10,r10,32
-       stvx    v23,r11,$sp
+       stvx    v26,r11,$sp
        addi    r11,r11,32
-       stvx    v24,r10,$sp
+       stvx    v27,r10,$sp
        addi    r10,r10,32
-       stvx    v25,r11,$sp
+       stvx    v28,r11,$sp
        addi    r11,r11,32
-       stvx    v26,r10,$sp
+       stvx    v29,r10,$sp
        addi    r10,r10,32
-       stvx    v27,r11,$sp
-       stvx    v28,r10,$sp
+       stvx    v30,r11,$sp
+       stvx    v31,r10,$sp
        stw     r12,`$FRAME-$SIZE_T*18-4`($sp)  # save vrsave
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
@@ -480,7 +480,7 @@ $code.=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
-       li      r12,-8
+       li      r12,-4096+511
        $PUSH   r0, `$FRAME+$LRSAVE`($sp)
        mtspr   256,r12                         # preserve 29 AltiVec registers
 
@@ -588,9 +588,13 @@ ___
        my @thread3=&ROUND(0,4,8,12);
 
        foreach (@thread0) {
-               eval;                   eval(shift(@thread3));
-               eval(shift(@thread1));  eval(shift(@thread3));
-               eval(shift(@thread2));  eval(shift(@thread3));
+               eval;
+               eval(shift(@thread1));
+               eval(shift(@thread2));
+
+               eval(shift(@thread3));
+               eval(shift(@thread3));
+               eval(shift(@thread3));
        }
        foreach (@thread3) { eval; }
 
@@ -600,9 +604,13 @@ ___
        @thread3=&ROUND(0,5,10,15);
 
        foreach (@thread0) {
-               eval;                   eval(shift(@thread3));
-               eval(shift(@thread1));  eval(shift(@thread3));
-               eval(shift(@thread2));  eval(shift(@thread3));
+               eval;
+               eval(shift(@thread1));
+               eval(shift(@thread2));
+
+               eval(shift(@thread3));
+               eval(shift(@thread3));
+               eval(shift(@thread3));
        }
        foreach (@thread3) { eval; }
 $code.=<<___;
@@ -843,22 +851,22 @@ Ldone_vmx:
        li      r10,`15+$LOCALS+64`
        li      r11,`31+$LOCALS+64`
        mtspr   256,r12                         # restore vrsave
-       lvx     v20,r10,$sp
+       lvx     v23,r10,$sp
        addi    r10,r10,32
-       lvx     v21,r11,$sp
+       lvx     v24,r11,$sp
        addi    r11,r11,32
-       lvx     v22,r10,$sp
+       lvx     v25,r10,$sp
        addi    r10,r10,32
-       lvx     v23,r11,$sp
+       lvx     v26,r11,$sp
        addi    r11,r11,32
-       lvx     v24,r10,$sp
+       lvx     v27,r10,$sp
        addi    r10,r10,32
-       lvx     v25,r11,$sp
+       lvx     v28,r11,$sp
        addi    r11,r11,32
-       lvx     v26,r10,$sp
+       lvx     v29,r10,$sp
        addi    r10,r10,32
-       lvx     v27,r11,$sp
-       lvx     v28,r10,$sp
+       lvx     v30,r11,$sp
+       lvx     v31,r10,$sp
        $POP    r0, `$FRAME+$LRSAVE`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
        $POP    r15,`$FRAME-$SIZE_T*17`($sp)
diff --git a/crypto/modes/asm/ghashp8-ppc.pl b/crypto/modes/asm/ghashp8-ppc.pl
index 6df485e..6a2ac71 100755
--- a/crypto/modes/asm/ghashp8-ppc.pl
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -30,7 +30,7 @@
 # 2x aggregated reduction improves performance by 50% (resulting
 # performance on POWER8 is 1 cycle per processed byte), and 4x
 # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
-# POWER9 delivers 0.40 cpb.
+# POWER9 delivers 0.51 cpb.
 
 $flavour=shift;
 $output =shift;
diff --git a/crypto/poly1305/asm/poly1305-ppc.pl 
b/crypto/poly1305/asm/poly1305-ppc.pl
index cb4ae23..0c6d015 100755
--- a/crypto/poly1305/asm/poly1305-ppc.pl
+++ b/crypto/poly1305/asm/poly1305-ppc.pl
@@ -28,7 +28,7 @@
 # PPC970               7.00/+114%      3.51/+205%
 # POWER7               3.75/+260%      1.93/+100%
 # POWER8               -               2.03/+200%
-# POWER9               -               1.56/+150%
+# POWER9               -               2.00/+150%
 #
 # Do we need floating-point implementation for PPC? Results presented
 # in poly1305_ieee754.c are tricky to compare to, because they are for
diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl 
b/crypto/poly1305/asm/poly1305-ppcfp.pl
index 2abb8e2..09f8185 100755
--- a/crypto/poly1305/asm/poly1305-ppcfp.pl
+++ b/crypto/poly1305/asm/poly1305-ppcfp.pl
@@ -26,7 +26,6 @@
 # PPC970               6.03/+80%
 # POWER7               3.50/+30%
 # POWER8               3.75/+10%
-# POWER9               2.80/+12%
 
 $flavour = shift;
 
diff --git a/crypto/poly1305/poly1305_ieee754.c 
b/crypto/poly1305/poly1305_ieee754.c
index 1a06e03..995a02e 100644
--- a/crypto/poly1305/poly1305_ieee754.c
+++ b/crypto/poly1305/poly1305_ieee754.c
@@ -38,7 +38,6 @@
  * POWER6               4.92
  * POWER7               4.50
  * POWER8               4.10
- * POWER9               3.14
  *
  * z10                  11.2
  * z196+                7.30
diff --git a/crypto/sha/asm/keccak1600-ppc64.pl 
b/crypto/sha/asm/keccak1600-ppc64.pl
index bc1023e..30e70c5 100755
--- a/crypto/sha/asm/keccak1600-ppc64.pl
+++ b/crypto/sha/asm/keccak1600-ppc64.pl
@@ -30,7 +30,7 @@
 # PPC970/G5    14.6/+120%
 # POWER7       10.3/+100%
 # POWER8       11.5/+85%
-# POWER9       7.2/+45%
+# POWER9       9.4/+45%
 #
 # (*)  Corresponds to SHA3-256. Percentage after slash is improvement
 #      over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
diff --git a/crypto/sha/asm/keccak1600p8-ppc.pl 
b/crypto/sha/asm/keccak1600p8-ppc.pl
index a0aeeb0..de2bcd6 100755
--- a/crypto/sha/asm/keccak1600p8-ppc.pl
+++ b/crypto/sha/asm/keccak1600p8-ppc.pl
@@ -23,7 +23,7 @@
 # buffer for r=1088, which matches SHA3-256. This is 17% better than
 # scalar PPC64 code. It probably should be noted that if POWER8's
 # successor can achieve higher scalar instruction issue rate, then
-# this module will loose... And it does on POWER9 with 8.8 vs. 7.2.
+# this module will loose... And it does on POWER9 with 12.0 vs. 9.4.
 
 $flavour = shift;
 
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index a33ae4d..e3f522c 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -37,8 +37,8 @@
 # build of sha512-ppc.pl, presented for reference.
 #
 #              POWER8          POWER9
-# SHA256       9.9 [15.8]      9.2 [9.3]
-# SHA512       6.3 [10.3]      5.8 [5.9]
+# SHA256       9.7 [15.8]      11.2 [12.5]
+# SHA512       6.1 [10.3]      7.0 [7.9]
 
 $flavour=shift;
 $output =shift;
@@ -79,7 +79,8 @@ if ($output =~ /512/) {
 }
 
 $func="sha${bits}_block_p8";
-$FRAME=8*$SIZE_T;
+$LOCALS=8*$SIZE_T+8*16;
+$FRAME=$LOCALS+9*16+6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -91,16 +92,17 @@ $idx="r7";
 $lrsave="r8";
 $offload="r11";
 $vrsave="r12";
-($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
- $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+      $x00=0 if ($flavour =~ /osx/);
 
 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
-@X=map("v$_",(8..23));
-($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+@X=map("v$_",(8..19,24..27));
+($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
 
 sub ROUND {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 my $j=($i+1)%16;
+my $k=($i+2)%8;
 
 $code.=<<___           if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
        lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
@@ -112,26 +114,30 @@ ___
 $code.=<<___           if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
        vperm           @X[$i],@X[$i],@X[$i],$lemask
 ___
+$code.=<<___           if ($i>=15);
+       vshasigma${sz}  $Sigma,@X[($j+1)%16],0,0
+       vaddu${sz}m     @X[$j],@X[$j],$Sigma
+       vshasigma${sz}  $Sigma,@X[($j+14)%16],0,15
+       vaddu${sz}m     @X[$j],@X[$j],$Sigma
+       vaddu${sz}m     @X[$j],@X[$j],@X[($j+9)%16]
+___
 $code.=<<___;
-       `"vshasigma${sz}        $s0,@X[($j+1)%16],0,0"          if ($i>=15)`
-       vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
-       vshasigma${sz}  $S1,$e,1,15             ; Sigma1(e)
        vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
-       vshasigma${sz}  $S0,$a,1,0              ; Sigma0(a)
-       `"vshasigma${sz}        $s1,@X[($j+14)%16],0,15"        if ($i>=15)`
+       vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
+       vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
        vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
+       vshasigma${sz}  $Sigma,$e,1,15          ; Sigma1(e)
+       vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma1(e)
        vxor            $Func,$a,$b
-       `"vaddu${sz}m           @X[$j],@X[$j],@X[($j+9)%16]"    if ($i>=15)`
-       vaddu${sz}m     $h,$h,$S1               ; h+=Sigma1(e)
        vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
-       vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
        vaddu${sz}m     $d,$d,$h                ; d+=h
-       vaddu${sz}m     $S0,$S0,$Func           ; Sigma0(a)+Maj(a,b,c)
-       `"vaddu${sz}m           @X[$j],@X[$j],$s0"              if ($i>=15)`
-       lvx             $Ki,$idx,$Tbl           ; load next K[i]
-       addi            $idx,$idx,16
-       vaddu${sz}m     $h,$h,$S0               ; h+=Sigma0(a)+Maj(a,b,c)
-       `"vaddu${sz}m           @X[$j],@X[$j],$s1"              if ($i>=15)`
+       vshasigma${sz}  $Sigma,$a,1,0           ; Sigma0(a)
+       vaddu${sz}m     $Sigma,$Sigma,$Func     ; Sigma0(a)+Maj(a,b,c)
+       vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma0(a)+Maj(a,b,c)
+       lvx             $Ki,@I[$k],$idx         ; load next K[i]
+___
+$code.=<<___           if ($k == 7);
+       addi            $idx,$idx,0x80
 ___
 }
 
@@ -142,21 +148,13 @@ $code=<<___;
 .globl $func
 .align 6
 $func:
-       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+       $STU            $sp,-$FRAME($sp)
        mflr            $lrsave
-       li              r10,`$FRAME+8*16+15`
-       li              r11,`$FRAME+8*16+31`
-       stvx            v20,r10,$sp             # ABI says so
+       li              r10,`$LOCALS+15`
+       li              r11,`$LOCALS+31`
+       stvx            v24,r10,$sp             # ABI says so
        addi            r10,r10,32
        mfspr           $vrsave,256
-       stvx            v21,r11,$sp
-       addi            r11,r11,32
-       stvx            v22,r10,$sp
-       addi            r10,r10,32
-       stvx            v23,r11,$sp
-       addi            r11,r11,32
-       stvx            v24,r10,$sp
-       addi            r10,r10,32
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
@@ -169,26 +167,26 @@ $func:
        addi            r11,r11,32
        stvx            v30,r10,$sp
        stvx            v31,r11,$sp
-       li              r11,-1
-       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
+       li              r11,-4096+255
+       stw             $vrsave,`$FRAME+6*$SIZE_T-4`($sp)       # save vrsave
        li              $x10,0x10
-       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       $PUSH           r26,`$FRAME-6*$SIZE_T`($sp)
        li              $x20,0x20
-       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       $PUSH           r27,`$FRAME-5*$SIZE_T`($sp)
        li              $x30,0x30
-       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       $PUSH           r28,`$FRAME-4*$SIZE_T`($sp)
        li              $x40,0x40
-       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       $PUSH           r29,`$FRAME-3*$SIZE_T`($sp)
        li              $x50,0x50
-       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       $PUSH           r30,`$FRAME-2*$SIZE_T`($sp)
        li              $x60,0x60
-       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       $PUSH           r31,`$FRAME-1*$SIZE_T`($sp)
        li              $x70,0x70
-       $PUSH           $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+       $PUSH           $lrsave,`$FRAME+$LRSAVE`($sp)
        mtspr           256,r11
 
        bl              LPICmeup
-       addi            $offload,$sp,$FRAME+15
+       addi            $offload,$sp,`8*$SIZE_T+15`
 ___
 $code.=<<___           if ($LENDIAN);
        li              $idx,8
@@ -222,9 +220,9 @@ $code.=<<___;
 .align 5
 Loop:
        lvx             $Ki,$x00,$Tbl
-       li              $idx,16
        lvx_u           @X[0],0,$inp
        addi            $inp,$inp,16
+       mr              $idx,$Tbl               # copy $Tbl
        stvx            $A,$x00,$offload        # offload $A-$H
        stvx            $B,$x10,$offload
        stvx            $C,$x20,$offload
@@ -234,8 +232,7 @@ Loop:
        stvx            $G,$x60,$offload
        stvx            $H,$x70,$offload
        vaddu${sz}m     $H,$H,$Ki               # h+K[i]
-       lvx             $Ki,$idx,$Tbl
-       addi            $idx,$idx,16
+       lvx             $Ki,$x10,$Tbl
 ___
 for ($i=0;$i<16;$i++)  { &ROUND($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
@@ -268,10 +265,9 @@ $code.=<<___;
        bne             Loop
 ___
 $code.=<<___           if ($SZ==4);
-       lvx             @X[0],$idx,$Tbl
-       addi            $idx,$idx,16
+       lvx             @X[0],$x20,$idx
        vperm           $A,$A,$B,$Ki            # pack the answer
-       lvx             @X[1],$idx,$Tbl
+       lvx             @X[1],$x30,$idx
        vperm           $E,$E,$F,$Ki
        vperm           $A,$A,$C,@X[0]
        vperm           $E,$E,$G,@X[0]
@@ -291,19 +287,11 @@ $code.=<<___              if ($SZ==8);
        stvx_u          $G,$x30,$ctx
 ___
 $code.=<<___;
-       li              r10,`$FRAME+8*16+15`
+       li              r10,`$LOCALS+15`
        mtlr            $lrsave
-       li              r11,`$FRAME+8*16+31`
+       li              r11,`$LOCALS+31`
        mtspr           256,$vrsave
-       lvx             v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       lvx             v21,r11,$sp
-       addi            r11,r11,32
-       lvx             v22,r10,$sp
-       addi            r10,r10,32
-       lvx             v23,r11,$sp
-       addi            r11,r11,32
-       lvx             v24,r10,$sp
+       lvx             v24,r10,$sp             # ABI says so
        addi            r10,r10,32
        lvx             v25,r11,$sp
        addi            r11,r11,32
@@ -317,13 +305,13 @@ $code.=<<___;
        addi            r11,r11,32
        lvx             v30,r10,$sp
        lvx             v31,r11,$sp
-       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+       $POP            r26,`$FRAME-6*$SIZE_T`($sp)
+       $POP            r27,`$FRAME-5*$SIZE_T`($sp)
+       $POP            r28,`$FRAME-4*$SIZE_T`($sp)
+       $POP            r29,`$FRAME-3*$SIZE_T`($sp)
+       $POP            r30,`$FRAME-2*$SIZE_T`($sp)
+       $POP            r31,`$FRAME-1*$SIZE_T`($sp)
+       addi            $sp,$sp,$FRAME
        blr
        .long           0
        .byte           0,12,4,1,0x80,6,3,0
_____
openssl-commits mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits

[openssl-commits] [openssl] master update

Reply via email to