The branch master has been updated
       via  9d0e4dc6351df7d0c08400c4b4cf17c017022e50 (commit)
      from  a5fd24d19bbb586b1c6d235c2021e9bead22c9f5 (commit)


- Log -----------------------------------------------------------------
commit 9d0e4dc6351df7d0c08400c4b4cf17c017022e50
Author: Andy Polyakov <[email protected]>
Date:   Tue Nov 10 21:11:24 2015 +0100

    bn/asm/s390x.S: improve performance on z196 and z13 by up to 26%
    [even z10 is a couple of percent faster]. Triggered by RT#4128, but
    solves the problem by real modulo-scheduling.
    
    Reviewed-by: Rich Salz <[email protected]>

-----------------------------------------------------------------------

Summary of changes:
 crypto/bn/asm/s390x.S | 109 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 37 deletions(-)

diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
index 43fcb79..f5eebe4 100755
--- a/crypto/bn/asm/s390x.S
+++ b/crypto/bn/asm/s390x.S
@@ -18,71 +18,106 @@
 .align 4
 bn_mul_add_words:
        lghi    zero,0          // zero = 0
-       la      %r1,0(%r2)      // put rp aside
-       lghi    %r2,0           // i=0;
+       la      %r1,0(%r2)      // put rp aside [to give way to]
+       lghi    %r2,0           // return value
        ltgfr   %r4,%r4
        bler    %r14            // if (len<=0) return 0;
 
-       stmg    %r6,%r10,48(%r15)
-       lghi    %r10,3
-       lghi    %r8,0           // carry = 0
-       nr      %r10,%r4        // len%4
+       stmg    %r6,%r13,48(%r15)
+       lghi    %r2,3
+       lghi    %r12,0          // carry = 0
+       slgr    %r1,%r3         // rp-=ap
+       nr      %r2,%r4         // len%4
        sra     %r4,2           // cnt=len/4
        jz      .Loop1_madd     // carry is incidentally cleared if branch taken
        algr    zero,zero       // clear carry
 
-.Loop4_madd:
-       lg      %r7,0(%r2,%r3)  // ap[i]
+       lg      %r7,0(%r3)      // ap[0]
+       lg      %r9,8(%r3)      // ap[1]
        mlgr    %r6,%r5         // *=w
-       alcgr   %r7,%r8         // +=carry
-       alcgr   %r6,zero
-       alg     %r7,0(%r2,%r1)  // +=rp[i]
-       stg     %r7,0(%r2,%r1)  // rp[i]=
+       brct    %r4,.Loop4_madd
+       j       .Loop4_madd_tail
 
-       lg      %r9,8(%r2,%r3)
+.Loop4_madd:
        mlgr    %r8,%r5
+       lg      %r11,16(%r3)    // ap[i+2]
+       alcgr   %r7,%r12        // +=carry
+       alcgr   %r6,zero
+       alg     %r7,0(%r3,%r1)  // +=rp[i]
+       stg     %r7,0(%r3,%r1)  // rp[i]=
+
+       mlgr    %r10,%r5
+       lg      %r13,24(%r3)
        alcgr   %r9,%r6
        alcgr   %r8,zero
-       alg     %r9,8(%r2,%r1)
-       stg     %r9,8(%r2,%r1)
+       alg     %r9,8(%r3,%r1)
+       stg     %r9,8(%r3,%r1)
+
+       mlgr    %r12,%r5
+       lg      %r7,32(%r3)
+       alcgr   %r11,%r8
+       alcgr   %r10,zero
+       alg     %r11,16(%r3,%r1)
+       stg     %r11,16(%r3,%r1)
 
-       lg      %r7,16(%r2,%r3)
        mlgr    %r6,%r5
-       alcgr   %r7,%r8
-       alcgr   %r6,zero
-       alg     %r7,16(%r2,%r1)
-       stg     %r7,16(%r2,%r1)
+       lg      %r9,40(%r3)
+       alcgr   %r13,%r10
+       alcgr   %r12,zero
+       alg     %r13,24(%r3,%r1)
+       stg     %r13,24(%r3,%r1)
 
-       lg      %r9,24(%r2,%r3)
+       la      %r3,32(%r3)     // i+=4
+       brct    %r4,.Loop4_madd
+
+.Loop4_madd_tail:
        mlgr    %r8,%r5
+       lg      %r11,16(%r3)
+       alcgr   %r7,%r12        // +=carry
+       alcgr   %r6,zero
+       alg     %r7,0(%r3,%r1)  // +=rp[i]
+       stg     %r7,0(%r3,%r1)  // rp[i]=
+
+       mlgr    %r10,%r5
+       lg      %r13,24(%r3)
        alcgr   %r9,%r6
        alcgr   %r8,zero
-       alg     %r9,24(%r2,%r1)
-       stg     %r9,24(%r2,%r1)
+       alg     %r9,8(%r3,%r1)
+       stg     %r9,8(%r3,%r1)
 
-       la      %r2,32(%r2)     // i+=4
-       brct    %r4,.Loop4_madd
+       mlgr    %r12,%r5
+       alcgr   %r11,%r8
+       alcgr   %r10,zero
+       alg     %r11,16(%r3,%r1)
+       stg     %r11,16(%r3,%r1)
 
-       la      %r10,1(%r10)            // see if len%4 is zero ...
-       brct    %r10,.Loop1_madd        // without touching condition code:-)
+       alcgr   %r13,%r10
+       alcgr   %r12,zero
+       alg     %r13,24(%r3,%r1)
+       stg     %r13,24(%r3,%r1)
+
+       la      %r3,32(%r3)     // i+=4
+
+       la      %r2,1(%r2)      // see if len%4 is zero ...
+       brct    %r2,.Loop1_madd // without touching condition code:-)
 
 .Lend_madd:
-       alcgr   %r8,zero        // collect carry bit
-       lgr     %r2,%r8
-       lmg     %r6,%r10,48(%r15)
+       lgr     %r2,zero        // return value
+       alcgr   %r2,%r12        // collect even carry bit
+       lmg     %r6,%r13,48(%r15)
        br      %r14
 
 .Loop1_madd:
-       lg      %r7,0(%r2,%r3)  // ap[i]
+       lg      %r7,0(%r3)      // ap[i]
        mlgr    %r6,%r5         // *=w
-       alcgr   %r7,%r8         // +=carry
+       alcgr   %r7,%r12        // +=carry
        alcgr   %r6,zero
-       alg     %r7,0(%r2,%r1)  // +=rp[i]
-       stg     %r7,0(%r2,%r1)  // rp[i]=
+       alg     %r7,0(%r3,%r1)  // +=rp[i]
+       stg     %r7,0(%r3,%r1)  // rp[i]=
 
-       lgr     %r8,%r6
-       la      %r2,8(%r2)      // i++
-       brct    %r10,.Loop1_madd
+       lgr     %r12,%r6
+       la      %r3,8(%r3)      // i++
+       brct    %r2,.Loop1_madd
 
        j       .Lend_madd
 .size  bn_mul_add_words,.-bn_mul_add_words
_____
openssl-commits mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits

Reply via email to