> (Affects 1.0.2 only.)
>
> In crypto/ec/asm/ecp_nistz256-x86_64.pl, __ecp_nistz256_sqr_montq,
> under "Now the reduction" there are a number of comments saying
> "doesn't overflow". Unfortunately, they aren't correct.
Got the math wrong :-( Attached is not only a fixed version, but an even faster one.

On a related note: it's possible to improve server-side ECDSA by ~5% by switching [back] to scatter-gather. [The change away from scatter-gather was caused by concern about a timing dependency, but I argue that the concern is not valid in most cases.]

There are also x86 and ARM versions pending:

# with/without -DECP_NISTZ256_ASM
# Pentium	+66-168%
# PIII		+73-175%
# P4		+68-140%
# Core2		+90-215%
# Sandy Bridge	+105-265% (contemporary i[57]-* are all close to this)
# Atom		+66-160%
# Opteron	+54-112%
# Bulldozer	+99-240%
# VIA Nano	+93-300%

# with/without -DECP_NISTZ256_ASM
# Cortex-A8	+53-173%
# Cortex-A9	+76-205%
# Cortex-A15	+100-316%
# Snapdragon S4	+66-187%

No, the bug in question is not there. Nor is the AD*X code path affected.
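For anyone who wants to convince themselves of the math, here is a quick standalone check in plain Python. It is not part of the patch; the only thing it assumes is the published P-256 prime (the .Lpoly words typed in by hand). It verifies the identity the rewritten reduction leans on: p256[0] and p256[1] pack to 2^96 - 1, so adding acc[0] x p256 to the accumulator is the same as adding acc[0]<<96 (the shl/shr pair) plus acc[0] x p256[3] at word 3 (the mulq), with the acc[0] already sitting in word 0 cancelling the -acc[0] term.

# Standalone sanity check of the reduction identity (not from the patch;
# only the P-256 prime is assumed).
import random

P256 = [0xffffffffffffffff, 0x00000000ffffffff, 0, 0xffffffff00000001]  # .Lpoly, 64-bit words, least significant first
P = sum(w << (64 * i) for i, w in enumerate(P256))

# p256[0] + p256[1]*2^64 == 2^96 - 1
assert P256[0] + (P256[1] << 64) == (1 << 96) - 1

for _ in range(1000):
    acc0 = random.getrandbits(64)
    # One Montgomery reduction iteration adds acc0 * p to the accumulator;
    # the quotient digit is acc0 itself because p == -1 (mod 2^64).
    full = acc0 * P
    # What the shl/shr/mulq sequence adds instead: acc0<<96 plus
    # acc0*p256[3] at word 3, minus the acc0 that cancels word 0.
    shortcut = (acc0 << 96) + ((acc0 * P256[3]) << 192) - acc0
    assert full == shortcut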
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 4486a5e..f328b85 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -31,15 +31,15 @@
 # Further optimization by <ap...@openssl.org>:
 #
 #		this/original
-# Opteron	+8-33%
-# Bulldozer	+10-30%
-# P4		+14-38%
-# Westmere	+8-23%
-# Sandy Bridge	+8-24%
-# Ivy Bridge	+7-25%
-# Haswell	+5-25%
-# Atom		+10-32%
-# VIA Nano	+37-130%
+# Opteron	+10-43%
+# Bulldozer	+14-43%
+# P4		+18-50%
+# Westmere	+12-36%
+# Sandy Bridge	+9-36%
+# Ivy Bridge	+9-36%
+# Haswell	+8-37%
+# Atom		+15-50%
+# VIA Nano	+43-160%
 #
 # Ranges denote minimum and maximum improvement coefficients depending
 # on benchmark. Lower coefficients are for ECDSA sign, relatively
@@ -550,28 +550,20 @@ __ecp_nistz256_mul_montq:
 	# and add the result to the acc.
 	# Due to the special form of p256 we do some optimizations
 	#
-	# acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
-	# then we add acc[0] and get acc[0] x 2^64
-
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc0, $acc1		# +=acc[0]*2^64
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-
-	# acc[0] x p256[2] = 0
-	adc	%rdx, $acc2
-	adc	\$0, $t0
+	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+	# then we add acc[0] and get acc[0] x 2^96
+	mov	$acc0, $t1
+	shl	\$32, $acc0

 	mulq	$poly3
-	xor	$acc0, $acc0
-	add	$t0, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
+	shr	\$32, $t1
+	add	$acc0, $acc1		# +=acc[0]<<96
+	adc	$t1, $acc2
+	adc	%rax, $acc3
 	 mov	8*1($b_ptr), %rax
 	adc	%rdx, $acc4
 	adc	\$0, $acc5
+	xor	$acc0, $acc0

 	########################################################################
 	# Multiply by b[1]
@@ -608,23 +600,17 @@ __ecp_nistz256_mul_montq:

 	########################################################################
 	# Second reduction step
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3
-	adc	\$0, $t0
-
+	mov	$acc1, $t1
+	shl	\$32, $acc1
 	mulq	$poly3
-	xor	$acc1, $acc1
-	add	$t0, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
+	shr	\$32, $t1
+	add	$acc1, $acc2
+	adc	$t1, $acc3
+	adc	%rax, $acc4
 	 mov	8*2($b_ptr), %rax
 	adc	%rdx, $acc5
 	adc	\$0, $acc0
+	xor	$acc1, $acc1

 	########################################################################
 	# Multiply by b[2]
@@ -661,23 +647,17 @@

 	########################################################################
 	# Third reduction step
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $t0
-
+	mov	$acc2, $t1
+	shl	\$32, $acc2
 	mulq	$poly3
-	xor	$acc2, $acc2
-	add	$t0, $acc5
-	adc	\$0, %rdx
-	add	%rax, $acc5
+	shr	\$32, $t1
+	add	$acc2, $acc3
+	adc	$t1, $acc4
+	adc	%rax, $acc5
 	 mov	8*3($b_ptr), %rax
 	adc	%rdx, $acc0
 	adc	\$0, $acc1
+	xor	$acc2, $acc2

 	########################################################################
 	# Multiply by b[3]
@@ -714,20 +694,14 @@

 	########################################################################
 	# Final reduction step
-	mulq	$poly1
-	#xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc5
-	#adc	\$0, $t0		# doesn't overflow
-
+	mov	$acc3, $t1
+	shl	\$32, $acc3
 	mulq	$poly3
-	#add	$t0, $acc0
-	#adc	\$0, %rdx
+	shr	\$32, $t1
+	add	$acc3, $acc4
+	adc	$t1, $acc5
 	 mov	$acc4, $t0
-	add	%rax, $acc0
+	adc	%rax, $acc0
 	adc	%rdx, $acc1
 	 mov	$acc5, $t1
 	adc	\$0, $acc2
@@ -897,82 +871,55 @@ __ecp_nistz256_sqr_montq:
 	##########################################
 	# Now the reduction
 	# First iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc0, $acc1
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-	adc	%rdx, $acc2		# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc0, $t0
+	shl	\$32, $acc0
 	mulq	$t1
-	xor	$acc0, $acc0
-	#add	$t0, $acc3
-	#adc	\$0, %rdx
-	add	%rax, $acc3
+	shr	\$32, $t0
+	add	$acc0, $acc1		# +=acc[0]<<96
+	adc	$t0, $acc2
+	adc	%rax, $acc3
 	 mov	$acc1, %rax
 	adc	%rdx, $acc4
-	adc	\$0, $acc0
+	adc	\$0, $acc5
+	xor	$acc0, $acc0

 	##########################################
 	# Second iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3		# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc1, $t0
+	shl	\$32, $acc1
 	mulq	$t1
-	xor	$acc1, $acc1
-	#add	$t0, $acc4
-	#adc	\$0, %rdx
-	add	%rax, $acc4
+	shr	\$32, $t0
+	add	$acc1, $acc2
+	adc	$t0, $acc3
+	adc	%rax, $acc4
 	 mov	$acc2, %rax
 	adc	%rdx, $acc0
-	adc	\$0, $acc1
+	xor	$acc1, $acc1

 	##########################################
 	# Third iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4		# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc2, $t0
+	shl	\$32, $acc2
 	mulq	$t1
-	xor	$acc2, $acc2
-	#add	$t0, $acc0
-	#adc	\$0, %rdx
-	add	%rax, $acc0
+	shr	\$32, $t0
+	add	$acc2, $acc3
+	adc	$t0, $acc4
+	adc	%rax, $acc0
 	 mov	$acc3, %rax
 	adc	%rdx, $acc1
-	adc	\$0, $acc2
+	xor	$acc2, $acc2

 	###########################################
 	# Last iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc0		# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc3, $t0
+	shl	\$32, $acc3
 	mulq	$t1
-	xor	$acc3, $acc3
-	#add	$t0, $acc1
-	#adc	\$0, %rdx
-	add	%rax, $acc1
+	shr	\$32, $t0
+	add	$acc3, $acc4
+	adc	$t0, $acc0
+	adc	%rax, $acc1
 	adc	%rdx, $acc2
-	adc	\$0, $acc3
+	xor	$acc3, $acc3

 	############################################
 	# Add the rest of the acc