Hi,
> This patch is a contribution to OpenSSL.
>
> It concerns the P256 ECC implementation.
>
> The patch improves upon our previous submission, by providing a dedicated
> function to perform modular inversion modulo the P256 group order.
>
> Results:
> The performance improvements, for single threaded applications, compared to
> the current (development) version of OpenSSL are as follows.
>
> (measured by "openssl speed" utility)
>
>
> On Architecture Codename Haswell:
> ECDSA sign: 1.28X
> ECDSA verify: 1.10X
>
> On Architecture Broadwell:
> ECDSA sign: 1.42X
> ECDSA verify: 1.18X
>
> We license the whole submission under BSD license.
>
> Developers and authors:
> ***************************************************************************
> Shay Gueron (1, 2), and Vlad Krasnov (3)
> (1) University of Haifa, Israel
> (2) Intel Corporation, Israel Development Center, Haifa, Israel
> (3) CloudFlare, Inc.
> ***************************************************************************
Attached is a version refactored for the updated layout. It's a few percent
faster than the original (for several small reasons, e.g. avoiding excessive
%rip-relative addressing, because it doesn't fuse, and optimizing
back-to-back value passing through registers in squaring), and probably
more readable (for example, squaring now uses $acc6 and $acc7). Then I got
nervous about the possibility of an unaccounted carry and rearranged the
reduction step in a manner that precludes it. To be more specific, here is a
fragment of the original reduction step:
mov 8*1+.Lord(%rip), $t4
mul $t0
add $t1, $acc1
adc \$0, $t3
add $t4, $acc1
mov $t0, $t1
adc $t3, $acc2
adc \$0, $t1
sub $t0, $acc2
sbb \$0, $t1
The concern was that if $t0 happens to be all-ones, then you risk an
unaccounted carry in the last adc above. Upon closer look the concern
appears to be false, but as it's a bit counter-intuitive, the alternative is
provided anyway.
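
For reference, the result all four reduction rounds produce is ordinary
word-by-word Montgomery multiplication with the .Lord/.LordK constants added
below ("res = a*b*2^-256 mod Order(P)", as the prototype comment says). Here
is a minimal Python model of that - just my sketch for cross-checking test
vectors against ecp_nistz256_ord_mul_mont, not part of the patch; it assumes
both inputs are already reduced below the group order:

ORD  = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
ORDK = 0xccd1c8aaee00bc4f          # .LordK, i.e. -ORD^-1 mod 2^64
MASK = (1 << 64) - 1

def ord_mul_mont(a, b):
    # res = a * b * 2^-256 mod ORD, one 64-bit word of b per round
    acc = 0
    for i in range(4):
        acc += a * ((b >> (64 * i)) & MASK)   # accumulate a * b[i]
        m = ((acc & MASK) * ORDK) & MASK      # per-round Montgomery multiplier
        acc = (acc + m * ORD) >> 64           # low word cancels, shift it out
    return acc - ORD if acc >= ORD else acc   # final conditional subtraction

Feeding random reduced inputs through this model and through the new
functions should give identical results.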
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index c2621c2..39e60da 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -2,7 +2,13 @@
##############################################################################
# #
-# Copyright 2014 Intel Corporation #
+# Copyright (c) 2014,2015 Intel Corporation #
+# Copyright (c) 2015 CloudFlare, Inc. #
+# All rights reserved. #
+# #
+# This software is dual licensed under the Apache V.2.0 and BSD licenses #
+# #
+##############################################################################
# #
# Licensed under the Apache License, Version 2.0 (the "License"); #
# you may not use this file except in compliance with the License. #
@@ -18,10 +24,41 @@
# #
##############################################################################
# #
+# Redistribution and use in source and binary forms, with or without #
+# modification, are permitted provided that the following conditions are #
+# met: #
+# #
+# # Redistributions of source code must retain the above copyright #
+# notice, this list of conditions and the following disclaimer. #
+# #
+# # Redistributions in binary form must reproduce the above copyright #
+# notice, this list of conditions and the following disclaimer in the #
+# documentation and/or other materials provided with the #
+# distribution. #
+# #
+# # Neither the name of the copyright holders nor the names of its #
+# contributors may be used to endorse or promote products derived from #
+# this software without specific prior written permission. #
+# #
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS #
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED #
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR#
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+# #
+##############################################################################
+# #
# Developers and authors: #
-# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# Shay Gueron (1, 2), and Vlad Krasnov (1, 3) #
# (1) Intel Corporation, Israel Development Center #
# (2) University of Haifa #
+# (3) CloudFlare, Inc. #
# Reference: #
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
# 256 Bit Primes" #
@@ -31,23 +68,25 @@
# Further optimization by <[email protected]>:
#
# this/original with/without -DECP_NISTZ256_ASM(*)
-# Opteron +12-49% +110-150%
-# Bulldozer +14-45% +175-210%
-# P4 +18-46% n/a :-(
-# Westmere +12-34% +80-87%
-# Sandy Bridge +9-35% +110-120%
-# Ivy Bridge +9-35% +110-125%
-# Haswell +8-37% +140-160%
-# Broadwell +18-58% +145-210%
-# Atom +15-50% +130-180%
-# VIA Nano +43-160% +300-480%
+# Opteron +15-49% +150-195%
+# Bulldozer +18-45% +175-240%
+# P4 +24-46% +100-150%
+# Westmere +18-34% +87-160%
+# Sandy Bridge +14-35% +120-185%
+# Ivy Bridge +11-35% +125-180%
+# Haswell +10-37% +160-200%
+# Broadwell +24-58% +210-270%
+# Atom +20-50% +180-240%
+# VIA Nano +50-160% +480-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In "this/original" column lower coefficient is for
+# ECDSA sign, while in "with/without" - for ECDH key agreement, and
+# higher - for ECDSA sign, relatively fastest server-side operation.
+# Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output = shift;
@@ -108,6 +147,12 @@ $code.=<<___;
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
___
{
@@ -435,6 +480,1012 @@ my ($poly1,$poly3)=($acc6,$acc7);
$code.=<<___;
################################################################################
+# void ecp_nistz256_ord_mul_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# uint64_t b[4]);
+
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,\@function,3
+.align 32
+ecp_nistz256_ord_mul_mont:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov 8*0($b_org), %rax
+ mov $b_org, $b_ptr
+ lea .Lord(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# * b[0]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ mov %rax, $acc0
+ mov $t0, %rax
+ mov %rdx, $acc1
+
+ mulq 8*1($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc2
+
+ mulq 8*2($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc0, $acc5
+ imulq %r15,$acc0
+
+ mov %rdx, $acc3
+ mulq 8*3($a_ptr)
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# First reduction step
+ mulq 8*0(%r14)
+ mov $acc0, $t1
+ add %rax, $acc5 # guaranteed to be zero
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $acc0 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $t1, %rax
+ adc %rdx, $acc2
+ mov $t1, %rdx
+ adc \$0, $acc0 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*1($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+
+ ################################# * b[1]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc1, $t0
+ imulq %r15, $acc1
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ xor $acc0, $acc0
+ add %rax, $acc4
+ mov $acc1, %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+
+ ################################# Second reduction step
+ mulq 8*0(%r14)
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc1, %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $acc1 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t1, %rax
+ adc %rdx, $acc3
+ mov $t1, %rdx
+ adc \$0, $acc1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc4
+ mov 8*2($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc1, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+
+ ################################## * b[2]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc2, $t0
+ imulq %r15, $acc2
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ xor $acc1, $acc1
+ add %rax, $acc5
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+
+ ################################# Third reduction step
+ mulq 8*0(%r14)
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc2, %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc4
+ sbb \$0, $acc2 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc %rdx, $acc4
+ mov $t1, %rdx
+ adc \$0, $acc2 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc5
+ mov 8*3($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc2, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+
+ ################################# * b[3]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc3, $t0
+ imulq %r15, $acc3
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc0
+ adc \$0, %rdx
+ xor $acc2, $acc2
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ adc \$0, $acc2
+
+ ################################# Last reduction step
+ mulq 8*0(%r14)
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc3, %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc5
+ sbb \$0, $acc3 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc %rdx, $acc5
+ mov $t1, %rdx
+ adc \$0, $acc3 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc3, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ ################################# Subtract ord
+ mov $acc4, $a_ptr
+ sub 8*0(%r14), $acc4
+ mov $acc5, $acc3
+ sbb 8*1(%r14), $acc5
+ mov $acc0, $t0
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $a_ptr, $acc4
+ cmovc $acc3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# int rep);
+
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_mont:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov 8*0($a_ptr), $acc0
+ mov 8*1($a_ptr), %rax
+ mov 8*2($a_ptr), $acc6
+ mov 8*3($a_ptr), $acc7
+ lea .Lord(%rip), $a_ptr # pointer to modulus
+ mov $b_org, $b_ptr
+ jmp .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
+ ################################# a[1:] * a[0]
+ mov %rax, $t1 # put aside a[1]
+ mul $acc0 # a[1] * a[0]
+ mov %rax, $acc1
+ movq $t1, %xmm1 # offload a[1]
+ mov $acc6, %rax
+ mov %rdx, $acc2
+
+ mul $acc0 # a[2] * a[0]
+ add %rax, $acc2
+ mov $acc7, %rax
+ movq $acc6, %xmm2 # offload a[2]
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mul $acc0 # a[3] * a[0]
+ add %rax, $acc3
+ mov $acc7, %rax
+ movq $acc7, %xmm3 # offload a[3]
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# a[3] * a[2]
+ mul $acc6 # a[3] * a[2]
+ mov %rax, $acc5
+ mov $acc6, %rax
+ mov %rdx, $acc6
+
+ ################################# a[2:] * a[1]
+ mul $t1 # a[2] * a[1]
+ add %rax, $acc3
+ mov $acc7, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc7
+
+ mul $t1 # a[3] * a[1]
+ add %rax, $acc4
+ adc \$0, %rdx
+
+ add $acc7, $acc4
+ adc %rdx, $acc5
+ adc \$0, $acc6 # can't overflow
+
+ ################################# *2
+ xor $acc7, $acc7
+ mov $acc0, %rax
+ add $acc1, $acc1
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ adc $acc4, $acc4
+ adc $acc5, $acc5
+ adc $acc6, $acc6
+ adc \$0, $acc7
+
+ ################################# Missing products
+ mul %rax # a[0] * a[0]
+ mov %rax, $acc0
+ movq %xmm1, %rax
+ mov %rdx, $t1
+
+ mul %rax # a[1] * a[1]
+ add $t1, $acc1
+ adc %rax, $acc2
+ movq %xmm2, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mul %rax # a[2] * a[2]
+ add $t1, $acc3
+ adc %rax, $acc4
+ movq %xmm3, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mov $acc0, $t0
+ imulq 8*4($a_ptr), $acc0 # *= .LordK
+
+ mul %rax # a[3] * a[3]
+ add $t1, $acc5
+ adc %rax, $acc6
+ mov 8*0($a_ptr), %rax # modulus[0]
+ adc %rdx, $acc7 # can't overflow
+
+ ################################# First reduction step
+ mul $acc0
+ mov $acc0, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax # modulus[1]
+ adc %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc0
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $acc0, %rax
+ adc %rdx, $acc2
+ mov $acc0, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc1, $t0
+ imulq 8*4($a_ptr), $acc1 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc0 # can't borrow
+
+ add $t1, $acc3
+ adc \$0, $acc0 # can't overflow
+
+ ################################# Second reduction step
+ mul $acc1
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc1
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $acc1, %rax
+ adc %rdx, $acc3
+ mov $acc1, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc2, $t0
+ imulq 8*4($a_ptr), $acc2 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc1 # can't borrow
+
+ add $t1, $acc0
+ adc \$0, $acc1 # can't overflow
+
+ ################################# Third reduction step
+ mul $acc2
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc0
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc2
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ mov $acc2, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc3, $t0
+ imulq 8*4($a_ptr), $acc3 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc1
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc2 # can't borrow
+
+ add $t1, $acc1
+ adc \$0, $acc2 # can't overflow
+
+ ################################# Last reduction step
+ mul $acc3
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc1
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc3
+ add $t0, $acc0
+ adc \$0, %rdx
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ mov $acc3, %rdx
+ adc \$0, $t1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc2
+ sbb %rdx, $acc3 # can't borrow
+
+ add $t1, $acc2
+ adc \$0, $acc3 # can't overflow
+
+ ################################# Add bits [511:256] of the sqr result
+ xor %rdx, %rdx
+ add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc0, $acc4
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ ################################# Compare to modulus
+ sub 8*0($a_ptr), $acc0
+ mov $acc2, $acc6
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc7
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rdx
+
+ cmovc $acc4, $acc0
+ cmovnc $acc1, %rax
+ cmovnc $acc2, $acc6
+ cmovnc $acc3, $acc7
+
+ dec $b_ptr
+ jnz .Loop_ord_sqr
+
+ mov $acc0, 8*0($r_ptr)
+ mov %rax, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc6, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc7, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+$code.=<<___ if ($addx);
+################################################################################
+.type ecp_nistz256_ord_mul_montx,\@function,3
+.align 32
+ecp_nistz256_ord_mul_montx:
+.Lecp_nistz256_ord_mul_montx:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rdx
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+ lea -128($a_ptr), $a_ptr # control u-op density
+ lea .Lord-128(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# Multiply by b[0]
+ mulx $acc1, $acc0, $acc1
+ mulx $acc2, $t0, $acc2
+ mulx $acc3, $t1, $acc3
+ add $t0, $acc1
+ mulx $acc4, $t0, $acc4
+ mov $acc0, %rdx
+ mulx %r15, %rdx, %rax
+ adc $t1, $acc2
+ adc $t0, $acc3
+ adc \$0, $acc4
+
+ ################################# reduction
+ xor $acc5, $acc5 # $acc5=0, cf=0, of=0
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*1($b_ptr), %rdx
+ adcx $t0, $acc3
+ adox $t1, $acc4
+ adcx $acc0, $acc4
+ adox $acc0, $acc5
+ adc \$0, $acc5 # cf=0, of=0
+
+ ################################# Multiply by b[1]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc1, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ adcx $acc0, $acc5
+ adox $acc0, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc1 # guaranteed to be zero
+ adox $t1, $acc2
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*2($b_ptr), %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adcx $acc1, $acc5
+ adox $acc1, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# Multiply by b[2]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc2, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ adcx $acc1, $acc0
+ adox $acc1, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*3($b_ptr), %rdx
+ adcx $t0, $acc5
+ adox $t1, $acc0
+ adcx $acc2, $acc0
+ adox $acc2, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# Multiply by b[3]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc3, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc0
+ adox $t1, $acc1
+
+ adcx $acc2, $acc1
+ adox $acc2, $acc2
+ adc \$0, $acc2 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc3 # guaranteed to be zero
+ adox $t1, $acc4
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128(%r14), $t0, $t1
+ lea 128(%r14),%r14
+ mov $acc4, $t2
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mov $acc5, $t3
+ adcx $acc3, $acc1
+ adox $acc3, $acc2
+ adc \$0, $acc2
+
+ #################################
+ # Branch-less conditional subtraction of P
+ mov $acc0, $t0
+ sub 8*0(%r14), $acc4
+ sbb 8*1(%r14), $acc5
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type ecp_nistz256_ord_sqr_montx,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.Lecp_nistz256_ord_sqr_montx:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $b_org, $b_ptr
+ mov 8*0($a_ptr), %rdx
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+ lea .Lord(%rip), $a_ptr
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
+ mov %rdx, %rax # offload a[0]
+ movq $acc6, %xmm1 # offload a[1]
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
+ mov $acc6, %rdx
+ add $t0, $acc2
+ movq $acc7, %xmm2 # offload a[2]
+ adc $t1, $acc3
+ adc \$0, $acc4
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
+ #################################
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
+ mov $acc7, %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adc \$0, $acc5
+ #################################
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
+ mov %rax, %rdx
+ movq $acc0, %xmm3 # offload a[3]
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
+ adcx $acc1, $acc1 # acc1:6<<1
+ adox $t0, $acc5
+ adcx $acc2, $acc2
+ adox $acc7, $acc6 # of=0
+
+ ################################# a[i]*a[i]
+ mulx %rdx, $acc0, $t1
+ movq %xmm1, %rdx
+ adcx $acc3, $acc3
+ adox $t1, $acc1
+ adcx $acc4, $acc4
+ mulx %rdx, $t0, $t4
+ movq %xmm2, %rdx
+ adcx $acc5, $acc5
+ adox $t0, $acc2
+ adcx $acc6, $acc6
+ mulx %rdx, $t0, $t1
+ .byte 0x67
+ movq %xmm3, %rdx
+ adox $t4, $acc3
+ adcx $acc7, $acc7
+ adox $t0, $acc4
+ adox $t1, $acc5
+ mulx %rdx, $t0, $t4
+ adox $t0, $acc6
+ adox $t4, $acc7
+
+ ################################# reduction
+ mov $acc0, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ xor %rax, %rax # cf=0, of=0
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0 # of=0
+ adcx %rax, $acc0 # cf=0
+
+ #################################
+ mov $acc1, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc1 # guaranteed to be zero
+ adcx $t1, $acc2
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc3
+ adcx $t1, $acc0
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1 # cf=0
+ adox %rax, $acc1 # of=0
+
+ #################################
+ mov $acc2, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2 # of=0
+ adcx %rax, $acc2 # cf=0
+
+ #################################
+ mov $acc3, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc3 # guaranteed to be zero
+ adcx $t1, $acc0
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc1
+ adcx $t1, $acc2
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ adox %rax, $acc3
+
+ ################################# accumulate upper half
+ add $acc0, $acc4 # add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc4, %rdx
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, $acc6
+ adc \$0, %rax
+
+ ################################# compare to modulus
+ sub 8*0($a_ptr), $acc4
+ mov $acc2, $acc7
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc0
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rax
+
+ cmovnc $acc4, %rdx
+ cmovnc $acc1, $acc6
+ cmovnc $acc2, $acc7
+ cmovnc $acc3, $acc0
+
+ dec $b_ptr
+ jnz .Loop_ord_sqrx
+
+ mov %rdx, 8*0($r_ptr)
+ mov $acc6, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc7, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc0, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+################################################################################
# void ecp_nistz256_to_mont(
# uint64_t res[4],
# uint64_t in[4]);
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index ebfaae3..8bae928 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -196,6 +196,10 @@ struct ec_method_st {
int (*field_decode) (const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
BN_CTX *);
int (*field_set_to_one) (const EC_GROUP *, BIGNUM *r, BN_CTX *);
+
+ /* Inverse modulo order */
+ int (*field_inverse_mod_ord) (const EC_GROUP *, BIGNUM *r, BIGNUM *x,
+ BN_CTX *ctx);
} /* EC_METHOD */ ;
typedef struct ec_extra_data_st {
@@ -545,7 +549,6 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,
unsigned char *digit, unsigned char in);
#endif
-int ec_precompute_mont_data(EC_GROUP *);
#ifdef ECP_NISTZ256_ASM
/** Returns GFp methods using montgomery multiplication, with x86-64 optimized
@@ -612,3 +615,6 @@ int ossl_ecdsa_verify(int type, const unsigned char *dgst, int dgst_len,
const unsigned char *sigbuf, int sig_len, EC_KEY *eckey);
int ossl_ecdsa_verify_sig(const unsigned char *dgst, int dgst_len,
const ECDSA_SIG *sig, EC_KEY *eckey);
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+ BIGNUM *x, BN_CTX *ctx);
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index 7cb4759..556d526 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -262,6 +262,8 @@ int EC_METHOD_get_field_type(const EC_METHOD *meth)
return meth->field_type;
}
+static int ec_precompute_mont_data(EC_GROUP *);
+
int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
const BIGNUM *order, const BIGNUM *cofactor)
{
@@ -305,11 +307,6 @@ const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group)
return group->generator;
}
-BN_MONT_CTX *EC_GROUP_get_mont_data(const EC_GROUP *group)
-{
- return group->mont_data;
-}
-
int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx)
{
if (!BN_copy(order, group->order))
@@ -1063,7 +1060,7 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group)
* ec_precompute_mont_data sets |group->mont_data| from |group->order| and
* returns one on success. On error it returns zero.
*/
-int ec_precompute_mont_data(EC_GROUP *group)
+static int ec_precompute_mont_data(EC_GROUP *group)
{
BN_CTX *ctx = BN_CTX_new();
int ret = 0;
@@ -1091,3 +1088,12 @@ int ec_precompute_mont_data(EC_GROUP *group)
BN_CTX_free(ctx);
return ret;
}
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+ BIGNUM *x, BN_CTX *ctx)
+{
+ if (group->meth->field_inverse_mod_ord != NULL) {
+ return group->meth->field_inverse_mod_ord(group, res, x, ctx);
+ }
+ else return 0;
+}
diff --git a/crypto/ec/ecdsa_ossl.c b/crypto/ec/ecdsa_ossl.c
index e48d100..cbe2d06 100644
--- a/crypto/ec/ecdsa_ossl.c
+++ b/crypto/ec/ecdsa_ossl.c
@@ -182,30 +182,33 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
}
while (BN_is_zero(r));
- /* compute the inverse of k */
- if (EC_GROUP_get_mont_data(group) != NULL) {
- /*
- * We want inverse in constant time, therefore we utilize the fact
- * order must be prime and use Fermats Little Theorem instead.
- */
- if (!BN_set_word(X, 2)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- if (!BN_mod_sub(X, order, X, order, ctx)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- BN_set_flags(X, BN_FLG_CONSTTIME);
- if (!BN_mod_exp_mont_consttime
- (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- } else {
- if (!BN_mod_inverse(k, k, order, ctx)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
+ /* Check if optimized inverse is implemented */
+ if (EC_GROUP_do_inverse_ord(group, k, k, ctx) == 0) {
+ /* compute the inverse of k */
+ if (group->mont_data != NULL) {
+ /*
+ * We want inverse in constant time, therefore we utilize the fact
+ * order must be prime and use Fermats Little Theorem instead.
+ */
+ if (!BN_set_word(X, 2)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ if (!BN_mod_sub(X, order, X, order, ctx)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ BN_set_flags(X, BN_FLG_CONSTTIME);
+ if (!BN_mod_exp_mont_consttime(k, k, X, order, ctx,
+ group->mont_data)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ } else {
+ if (!BN_mod_inverse(k, k, order, ctx)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
}
}
@@ -420,9 +423,12 @@ int ossl_ecdsa_verify_sig(const unsigned char *dgst, int dgst_len,
goto err;
}
/* calculate tmp1 = inv(S) mod order */
- if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
- ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
- goto err;
+ /* Check if optimized inverse is implemented */
+ if (EC_GROUP_do_inverse_ord(group, u2, sig->s, ctx) == 0) {
+ if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
+ ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
+ goto err;
+ }
}
/* digest -> m */
i = BN_num_bits(order);
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
index 3d83303..cdea215 100644
--- a/crypto/ec/ecp_nistz256.c
+++ b/crypto/ec/ecp_nistz256.c
@@ -903,7 +903,7 @@ __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
*/
#if defined(ECP_NISTZ256_AVX2)
# if !(defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_AMD64) || defined(_MX64)) || \
+ defined(_M_AMD64) || defined(_M_X64)) || \
!(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
# undef ECP_NISTZ256_AVX2
# else
@@ -1480,6 +1480,107 @@ static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
ecp_nistz256_pre_comp_clear_free) != NULL;
}
+#if defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64)
+/* Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P) */
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ int rep);
+
+static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r,
+ BIGNUM *x, BN_CTX *ctx)
+{
+ const BN_ULONG RR[P256_LIMBS] = { TOBN(0x83244c95,0xbe79eea2),
+ TOBN(0x4699799c,0x49bd6fa6),
+ TOBN(0x2845b239,0x2b6bec59),
+ TOBN(0x66e12d94,0xf3d95620) };
+ const BN_ULONG one[P256_LIMBS] = { TOBN(0,1),TOBN(0,0),
+ TOBN(0,0),TOBN(0,0) };
+ const unsigned char expLo[32] = { 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,
+ 0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4,
+ 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,
+ 0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf };
+ /*
+ * We don't use entry 0 in the table, so we omit it and address
+ * with -1 offset.
+ */
+ BN_ULONG table[15][P256_LIMBS];
+ BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
+ int i, ret = 0;
+
+ /*
+ * Catch allocation failure early.
+ */
+ if (bn_wexpand(r, P256_LIMBS) == NULL)
+ goto err;
+
+ if ((BN_num_bits(x) > 256) || BN_is_negative(x)) {
+ BIGNUM *tmp;
+
+ if ((tmp = BN_CTX_get(ctx)) == NULL)
+ goto err;
+
+ if (!BN_nnmod(tmp, x, group->order, ctx)) {
+ goto err;
+ }
+ x = tmp;
+ }
+
+ /*
+ * The exponent is public, no need for constant-time access
+ */
+ ecp_nistz256_ord_mul_mont(table[0], bn_get_words(x), RR);
+ for (i = 2; i < 16; i += 2) {
+ ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1);
+ ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]);
+ }
+
+ /*
+ * The top 128 bits of the exponent are highly redundant, so we
+ * perform an optimized flow
+ */
+ ecp_nistz256_ord_sqr_mont(out, table[15-1], 4); /* f0 */
+ ecp_nistz256_ord_mul_mont(out, out, table[15-1]); /* ff */
+
+ ecp_nistz256_ord_sqr_mont(t, out, 8); /* ff00 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */
+
+ ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(t, out, 64); /* ffffffff0000000000000000 */
+ ecp_nistz256_ord_mul_mont(t, out, t); /* ffffffff00000000ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(t, t, 32); /* ffffffff00000000ffffffff00000000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */
+
+ /*
+ * The bottom 128 bits of the exponent are easier to do with a table
+ */
+ for(i = 0; i < 32; i++) {
+ ecp_nistz256_ord_sqr_mont(out, out, 4);
+ ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]);
+ }
+
+ ecp_nistz256_ord_mul_mont(out, out, one);
+
+ /*
+ * Can't fail, but check return code to be consistent anyway.
+ */
+ if (!bn_set_words(r, out, P256_LIMBS))
+ goto err;
+
+ ret = 1;
+err:
+ return ret;
+}
+#else
+# define ecp_nistz256_inv_mod_ord NULL
+#endif
+
const EC_METHOD *EC_GFp_nistz256_method(void)
{
static const EC_METHOD ret = {
@@ -1519,7 +1620,8 @@ const EC_METHOD *EC_GFp_nistz256_method(void)
0, /* field_div */
ec_GFp_mont_field_encode,
ec_GFp_mont_field_decode,
- ec_GFp_mont_field_set_to_one
+ ec_GFp_mont_field_set_to_one,
+ ecp_nistz256_inv_mod_ord /* can be #define'd to NULL */
};
return &ret;
diff --git a/include/openssl/ec.h b/include/openssl/ec.h
index 1dc2db1..f294101 100644
--- a/include/openssl/ec.h
+++ b/include/openssl/ec.h
@@ -243,12 +243,6 @@ int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
*/
const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group);
-/** Returns the montgomery data for order(Generator)
- * \param group EC_GROUP object
- * \return the currently used generator (possibly NULL).
-*/
-BN_MONT_CTX *EC_GROUP_get_mont_data(const EC_GROUP *group);
-
/** Gets the order of a EC_GROUP
* \param group EC_GROUP object
* \param order BIGNUM to which the order is copied
diff --git a/util/libeay.num b/util/libeay.num
index e9a678b..fc16e39 100755
--- a/util/libeay.num
+++ b/util/libeay.num
@@ -4412,7 +4412,7 @@ ECDSA_METHOD_set_app_data 4768 1_1_0 NOEXIST::FUNCTION:
sk_deep_copy 4769 1_1_0 EXIST::FUNCTION:
ECDSA_METHOD_get_app_data 4770 1_1_0 NOEXIST::FUNCTION:
X509_VERIFY_PARAM_add1_host 4771 1_1_0 EXIST::FUNCTION:
-EC_GROUP_get_mont_data 4772 1_1_0 EXIST::FUNCTION:EC
+EC_GROUP_get_mont_data 4772 1_1_0 NOEXIST::FUNCTION:
i2d_re_X509_tbs 4773 1_1_0 EXIST::FUNCTION:
EVP_PKEY_asn1_set_item 4774 1_1_0 EXIST::FUNCTION:
RSA_security_bits 4775 1_1_0 EXIST::FUNCTION:RSA
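
P.S. For anyone reviewing the new ecp_nistz256_inv_mod_ord flow, here is a
quick Python sanity check - again just a sketch, not part of the patch -
that the constants and the fixed exponentiation schedule amount to Fermat
inversion, i.e. x^(ord-2) mod ord. (The final multiplication by "one" only
converts the result out of the Montgomery domain and doesn't affect the
exponent.)

ORD = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551

# .LordK * ORD must be -1 mod 2^64 for the "guaranteed to be zero" steps
assert (0xccd1c8aaee00bc4f * ORD) % (1 << 64) == (1 << 64) - 1
# RR in ecp_nistz256_inv_mod_ord should be 2^512 mod ord (Montgomery RR)
assert pow(2, 512, ORD) == 0x66e12d94f3d956202845b2392b6bec594699799c49bd6fa683244c95be79eea2

# Replay the schedule on exponents: squaring shifts the exponent left,
# multiplying by table[j-1] adds j.
exp_lo = [0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4,
          0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf]
out = 0xf                # table[15-1]
out = (out << 4) + 0xf   # ff
out += out << 8          # ffff
out += out << 16         # ffffffff
t = (out << 64) + out    # ffffffff00000000ffffffff
out += t << 32           # ffffffff00000000ffffffffffffffff
for nibble in exp_lo:    # 32 rounds of square-by-4 plus table lookup
    out = (out << 4) + nibble
assert out == ORD - 2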