> (Affects 1.0.2 only.)
> 
> In crypto/ec/asm/ecp_nistz256-x86_64.pl, __ecp_nistz256_sqr_montq,
> under "Now the reduction" there are a number of comments saying
> "doesn't overflow". Unfortunately, they aren't correct.

I got the math wrong :-( Attached is not only a fixed version, but an even
faster one. On a related note, it's possible to improve server-side DSA by ~5%
by switching [back] to scatter-gather. [The change from scatter-gather was
caused by a concern about timing dependency, but I argue that the concern is
not valid in most cases.] There are also x86 and ARM versions pending:

#               with/without -DECP_NISTZ256_ASM
# Pentium       +66-168%
# PIII          +73-175%
# P4            +68-140%
# Core2         +90-215%
# Sandy Bridge  +105-265% (contemporary i[57]-* are all close to this)
# Atom          +66-160%
# Opteron       +54-112%
# Bulldozer     +99-240%
# VIA Nano      +93-300%

#                       with/without -DECP_NISTZ256_ASM
# Cortex-A8             +53-173%
# Cortex-A9             +76-205%
# Cortex-A15            +100-316%
# Snapdragon S4         +66-187%

No, the bug in question is not there. Nor is the AD*X code path affected.

diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl 
b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 4486a5e..f328b85 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -31,15 +31,15 @@
 # Further optimization by <ap...@openssl.org>:
 #
 #              this/original
-# Opteron      +8-33%
-# Bulldozer    +10-30%
-# P4           +14-38%
-# Westmere     +8-23%
-# Sandy Bridge +8-24%
-# Ivy Bridge   +7-25%
-# Haswell      +5-25%
-# Atom         +10-32%
-# VIA Nano     +37-130%
+# Opteron      +10-43%
+# Bulldozer    +14-43%
+# P4           +18-50%
+# Westmere     +12-36%
+# Sandy Bridge +9-36%
+# Ivy Bridge   +9-36%
+# Haswell      +8-37%
+# Atom         +15-50%
+# VIA Nano     +43-160%
 #
 # Ranges denote minimum and maximum improvement coefficients depending
 # on benchmark. Lower coefficients are for ECDSA sign, relatively
@@ -550,28 +550,20 @@ __ecp_nistz256_mul_montq:
        # and add the result to the acc.
        # Due to the special form of p256 we do some optimizations
        #
-       # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
-       # then we add acc[0] and get acc[0] x 2^64
-
-       mulq    $poly1
-       xor     $t0, $t0
-       add     $acc0, $acc1            # +=acc[0]*2^64
-       adc     \$0, %rdx
-       add     %rax, $acc1
-       mov     $acc0, %rax
-
-       # acc[0] x p256[2] = 0
-       adc     %rdx, $acc2
-       adc     \$0, $t0
+       # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+       # then we add acc[0] and get acc[0] x 2^96
 
+       mov     $acc0, $t1
+       shl     \$32, $acc0
        mulq    $poly3
-       xor     $acc0, $acc0
-       add     $t0, $acc3
-       adc     \$0, %rdx
-       add     %rax, $acc3
+       shr     \$32, $t1
+       add     $acc0, $acc1            # +=acc[0]<<96
+       adc     $t1, $acc2
+       adc     %rax, $acc3
         mov    8*1($b_ptr), %rax
        adc     %rdx, $acc4
        adc     \$0, $acc5
+       xor     $acc0, $acc0
 
        ########################################################################
        # Multiply by b[1]
@@ -608,23 +600,17 @@ __ecp_nistz256_mul_montq:
 
        ########################################################################
        # Second reduction step 
-       mulq    $poly1
-       xor     $t0, $t0
-       add     $acc1, $acc2
-       adc     \$0, %rdx
-       add     %rax, $acc2
-       mov     $acc1, %rax
-       adc     %rdx, $acc3
-       adc     \$0, $t0
-
+       mov     $acc1, $t1
+       shl     \$32, $acc1
        mulq    $poly3
-       xor     $acc1, $acc1
-       add     $t0, $acc4
-       adc     \$0, %rdx
-       add     %rax, $acc4
+       shr     \$32, $t1
+       add     $acc1, $acc2
+       adc     $t1, $acc3
+       adc     %rax, $acc4
         mov    8*2($b_ptr), %rax
        adc     %rdx, $acc5
        adc     \$0, $acc0
+       xor     $acc1, $acc1
 
        ########################################################################
        # Multiply by b[2]
@@ -661,23 +647,17 @@ __ecp_nistz256_mul_montq:
 
        ########################################################################
        # Third reduction step  
-       mulq    $poly1
-       xor     $t0, $t0
-       add     $acc2, $acc3
-       adc     \$0, %rdx
-       add     %rax, $acc3
-       mov     $acc2, %rax
-       adc     %rdx, $acc4
-       adc     \$0, $t0
-
+       mov     $acc2, $t1
+       shl     \$32, $acc2
        mulq    $poly3
-       xor     $acc2, $acc2
-       add     $t0, $acc5
-       adc     \$0, %rdx
-       add     %rax, $acc5
+       shr     \$32, $t1
+       add     $acc2, $acc3
+       adc     $t1, $acc4
+       adc     %rax, $acc5
         mov    8*3($b_ptr), %rax
        adc     %rdx, $acc0
        adc     \$0, $acc1
+       xor     $acc2, $acc2
 
        ########################################################################
        # Multiply by b[3]
@@ -714,20 +694,14 @@ __ecp_nistz256_mul_montq:
 
        ########################################################################
        # Final reduction step  
-       mulq    $poly1
-       #xor    $t0, $t0
-       add     $acc3, $acc4
-       adc     \$0, %rdx
-       add     %rax, $acc4
-       mov     $acc3, %rax
-       adc     %rdx, $acc5
-       #adc    \$0, $t0                # doesn't overflow
-
+       mov     $acc3, $t1
+       shl     \$32, $acc3
        mulq    $poly3
-       #add    $t0, $acc0
-       #adc    \$0, %rdx
+       shr     \$32, $t1
+       add     $acc3, $acc4
+       adc     $t1, $acc5
         mov    $acc4, $t0
-       add     %rax, $acc0
+       adc     %rax, $acc0
        adc     %rdx, $acc1
         mov    $acc5, $t1
        adc     \$0, $acc2
@@ -897,82 +871,55 @@ __ecp_nistz256_sqr_montq:
        ##########################################
        # Now the reduction
        # First iteration
-       mulq    $a_ptr
-       #xor    $t0, $t0
-       add     $acc0, $acc1
-       adc     \$0, %rdx
-       add     %rax, $acc1
-       mov     $acc0, %rax
-       adc     %rdx, $acc2     # doesn't overflow
-       #adc    \$0, $t0
-
+       mov     $acc0, $t0
+       shl     \$32, $acc0
        mulq    $t1
-       xor     $acc0, $acc0
-       #add    $t0, $acc3
-       #adc    \$0, %rdx
-       add     %rax, $acc3
+       shr     \$32, $t0
+       add     $acc0, $acc1            # +=acc[0]<<96
+       adc     $t0, $acc2
+       adc     %rax, $acc3
         mov    $acc1, %rax
        adc     %rdx, $acc4
-       adc     \$0, $acc0
+       adc     \$0, $acc5
+       xor     $acc0, $acc0
 
        ##########################################
        # Second iteration
-       mulq    $a_ptr
-       #xor    $t0, $t0
-       add     $acc1, $acc2
-       adc     \$0, %rdx
-       add     %rax, $acc2
-       mov     $acc1, %rax
-       adc     %rdx, $acc3     # doesn't overflow
-       #adc    \$0, $t0
-
+       mov     $acc1, $t0
+       shl     \$32, $acc1
        mulq    $t1
-       xor     $acc1, $acc1
-       #add    $t0, $acc4
-       #adc    \$0, %rdx
-       add     %rax, $acc4
+       shr     \$32, $t0
+       add     $acc1, $acc2
+       adc     $t0, $acc3
+       adc     %rax, $acc4
         mov    $acc2, %rax
        adc     %rdx, $acc0
-       adc     \$0, $acc1
+       xor     $acc1, $acc1
 
        ##########################################
        # Third iteration
-       mulq    $a_ptr
-       #xor    $t0, $t0
-       add     $acc2, $acc3
-       adc     \$0, %rdx
-       add     %rax, $acc3
-       mov     $acc2, %rax
-       adc     %rdx, $acc4     # doesn't overflow
-       #adc    \$0, $t0
-
+       mov     $acc2, $t0
+       shl     \$32, $acc2
        mulq    $t1
-       xor     $acc2, $acc2
-       #add    $t0, $acc0
-       #adc    \$0, %rdx
-       add     %rax, $acc0
+       shr     \$32, $t0
+       add     $acc2, $acc3
+       adc     $t0, $acc4
+       adc     %rax, $acc0
         mov    $acc3, %rax
        adc     %rdx, $acc1
-       adc     \$0, $acc2
+       xor     $acc2, $acc2
 
        ###########################################
        # Last iteration
-       mulq    $a_ptr
-       #xor    $t0, $t0
-       add     $acc3, $acc4
-       adc     \$0, %rdx
-       add     %rax, $acc4
-       mov     $acc3, %rax
-       adc     %rdx, $acc0     # doesn't overflow
-       #adc    \$0, $t0
-
+       mov     $acc3, $t0
+       shl     \$32, $acc3
        mulq    $t1
-       xor     $acc3, $acc3
-       #add    $t0, $acc1
-       #adc    \$0, %rdx
-       add     %rax, $acc1
+       shr     \$32, $t0
+       add     $acc3, $acc4
+       adc     $t0, $acc0
+       adc     %rax, $acc1
        adc     %rdx, $acc2
-       adc     \$0, $acc3
+       xor     $acc3, $acc3
 
        ############################################
        # Add the rest of the acc

Reply via email to